Install and load required packages, if needed.
if(!require('tidyverse')){install.packages('tidyverse')} # For factor collapsing
library(tidyverse)
if(!require('ggplot2')){install.packages('ggplot2')} # For visualizations
library(ggplot2)
if(!require('gridExtra')){install.packages('gridExtra')} # For visualizations
library(gridExtra)
if(!require('corrplot')){install.packages('corrplot')} # For visualizations
library(corrplot)
if(!require('Hmisc')){install.packages('Hmisc')} # For rcorr function
library(Hmisc)
if(!require('ggcorrplot')){install.packages('ggcorrplot')} # For correlation heatmap
library(ggcorrplot)
if(!require('dplyr')){install.packages('dplyr')} # For grouping
library(dplyr)
if(!require('mltools')){install.packages('mltools')} # To create dummy variables
library(mltools)
if(!require('data.table')){install.packages('data.table')} # To create dummy variables
library(data.table)
First import the raw data, and save it to the object db.raw
db.raw <- read.csv('/Users/amyhowe/Desktop/diabetic_data.csv')
Let’s take a look at the structure of the dataset, and identify if there are any duplicated rows or “NA” values.
str(db.raw) # There are 50 variables and 101766 observations
## 'data.frame': 101766 obs. of 50 variables:
## $ encounter_id : int 2278392 149190 64410 500364 16680 35754 55842 63768 12522 15738 ...
## $ patient_nbr : int 8222157 55629189 86047875 82442376 42519267 82637451 84259809 114882984 48330783 63555939 ...
## $ race : Factor w/ 6 levels "?","AfricanAmerican",..: 4 4 2 4 4 4 4 4 4 4 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 1 1 1 2 2 2 2 2 1 1 ...
## $ age : Factor w/ 10 levels "[0-10)","[10-20)",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ weight : Factor w/ 10 levels "?","[0-25)","[100-125)",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ admission_type_id : int 6 1 1 1 1 2 3 1 2 3 ...
## $ discharge_disposition_id: int 25 1 1 1 1 1 1 1 1 3 ...
## $ admission_source_id : int 1 7 7 7 7 2 2 7 4 4 ...
## $ time_in_hospital : int 1 3 2 2 1 3 4 5 13 12 ...
## $ payer_code : Factor w/ 18 levels "?","BC","CH",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ medical_specialty : Factor w/ 73 levels "?","AllergyandImmunology",..: 39 1 1 1 1 1 1 1 1 20 ...
## $ num_lab_procedures : int 41 59 11 44 51 31 70 73 68 33 ...
## $ num_procedures : int 0 0 5 1 0 6 1 0 2 3 ...
## $ num_medications : int 1 18 13 16 8 16 21 12 28 18 ...
## $ number_outpatient : int 0 0 2 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_inpatient : int 0 0 1 0 0 0 0 0 0 0 ...
## $ diag_1 : Factor w/ 717 levels "?","10","11",..: 126 145 456 556 56 265 265 278 254 284 ...
## $ diag_2 : Factor w/ 749 levels "?","11","110",..: 1 81 80 99 26 248 248 316 262 48 ...
## $ diag_3 : Factor w/ 790 levels "?","11","110",..: 1 123 768 250 88 88 772 88 231 319 ...
## $ number_diagnoses : int 1 9 6 7 5 9 7 8 8 8 ...
## $ max_glu_serum : Factor w/ 4 levels ">200",">300",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ A1Cresult : Factor w/ 4 levels ">7",">8","None",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ metformin : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
## $ repaglinide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ nateglinide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ chlorpropamide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ glimepiride : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 2 2 ...
## $ acetohexamide : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ glipizide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 3 2 3 2 2 2 3 2 ...
## $ glyburide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 3 2 2 ...
## $ tolbutamide : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ pioglitazone : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ rosiglitazone : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 3 ...
## $ acarbose : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ miglitol : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ troglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ tolazamide : Factor w/ 3 levels "No","Steady",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ examide : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
## $ citoglipton : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
## $ insulin : Factor w/ 4 levels "Down","No","Steady",..: 2 4 2 4 3 3 3 2 3 3 ...
## $ glyburide.metformin : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ glipizide.metformin : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ glimepiride.pioglitazone: Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ metformin.rosiglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ metformin.pioglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ change : Factor w/ 2 levels "Ch","No": 2 1 2 1 1 2 1 2 1 1 ...
## $ diabetesMed : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
## $ readmitted : Factor w/ 3 levels "<30",">30","NO": 3 2 3 3 3 2 3 2 3 3 ...
length(subset(duplicated(db.raw), duplicated(db.raw) == TRUE)) # The output is 0, indicating there are no fully duplicated rows
## [1] 0
sum(is.na(db.raw)) # There are no "NA" values in entire dataset
## [1] 0
Now we will convert attributes to the correct data types, and add in any missing level names.
db.init <- db.raw # Put the dataset in a new object for initial analysis and cleaning
sapply(db.init[,c(3:6,11:12,23:50)], levels) # The levels of all factor attributes except for the diagnoses were checked here for overlap - none present
## $race
## [1] "?" "AfricanAmerican" "Asian" "Caucasian"
## [5] "Hispanic" "Other"
##
## $gender
## [1] "Female" "Male" "Unknown/Invalid"
##
## $age
## [1] "[0-10)" "[10-20)" "[20-30)" "[30-40)" "[40-50)" "[50-60)"
## [7] "[60-70)" "[70-80)" "[80-90)" "[90-100)"
##
## $weight
## [1] "?" "[0-25)" "[100-125)" "[125-150)" "[150-175)" "[175-200)"
## [7] "[25-50)" "[50-75)" "[75-100)" ">200"
##
## $payer_code
## [1] "?" "BC" "CH" "CM" "CP" "DM" "FR" "HM" "MC" "MD" "MP" "OG" "OT" "PO" "SI"
## [16] "SP" "UN" "WC"
##
## $medical_specialty
## [1] "?"
## [2] "AllergyandImmunology"
## [3] "Anesthesiology"
## [4] "Anesthesiology-Pediatric"
## [5] "Cardiology"
## [6] "Cardiology-Pediatric"
## [7] "DCPTEAM"
## [8] "Dentistry"
## [9] "Dermatology"
## [10] "Emergency/Trauma"
## [11] "Endocrinology"
## [12] "Endocrinology-Metabolism"
## [13] "Family/GeneralPractice"
## [14] "Gastroenterology"
## [15] "Gynecology"
## [16] "Hematology"
## [17] "Hematology/Oncology"
## [18] "Hospitalist"
## [19] "InfectiousDiseases"
## [20] "InternalMedicine"
## [21] "Nephrology"
## [22] "Neurology"
## [23] "Neurophysiology"
## [24] "Obsterics&Gynecology-GynecologicOnco"
## [25] "Obstetrics"
## [26] "ObstetricsandGynecology"
## [27] "Oncology"
## [28] "Ophthalmology"
## [29] "Orthopedics"
## [30] "Orthopedics-Reconstructive"
## [31] "Osteopath"
## [32] "Otolaryngology"
## [33] "OutreachServices"
## [34] "Pathology"
## [35] "Pediatrics"
## [36] "Pediatrics-AllergyandImmunology"
## [37] "Pediatrics-CriticalCare"
## [38] "Pediatrics-EmergencyMedicine"
## [39] "Pediatrics-Endocrinology"
## [40] "Pediatrics-Hematology-Oncology"
## [41] "Pediatrics-InfectiousDiseases"
## [42] "Pediatrics-Neurology"
## [43] "Pediatrics-Pulmonology"
## [44] "Perinatology"
## [45] "PhysicalMedicineandRehabilitation"
## [46] "PhysicianNotFound"
## [47] "Podiatry"
## [48] "Proctology"
## [49] "Psychiatry"
## [50] "Psychiatry-Addictive"
## [51] "Psychiatry-Child/Adolescent"
## [52] "Psychology"
## [53] "Pulmonology"
## [54] "Radiologist"
## [55] "Radiology"
## [56] "Resident"
## [57] "Rheumatology"
## [58] "Speech"
## [59] "SportsMedicine"
## [60] "Surgeon"
## [61] "Surgery-Cardiovascular"
## [62] "Surgery-Cardiovascular/Thoracic"
## [63] "Surgery-Colon&Rectal"
## [64] "Surgery-General"
## [65] "Surgery-Maxillofacial"
## [66] "Surgery-Neuro"
## [67] "Surgery-Pediatric"
## [68] "Surgery-Plastic"
## [69] "Surgery-PlasticwithinHeadandNeck"
## [70] "Surgery-Thoracic"
## [71] "Surgery-Vascular"
## [72] "SurgicalSpecialty"
## [73] "Urology"
##
## $max_glu_serum
## [1] ">200" ">300" "None" "Norm"
##
## $A1Cresult
## [1] ">7" ">8" "None" "Norm"
##
## $metformin
## [1] "Down" "No" "Steady" "Up"
##
## $repaglinide
## [1] "Down" "No" "Steady" "Up"
##
## $nateglinide
## [1] "Down" "No" "Steady" "Up"
##
## $chlorpropamide
## [1] "Down" "No" "Steady" "Up"
##
## $glimepiride
## [1] "Down" "No" "Steady" "Up"
##
## $acetohexamide
## [1] "No" "Steady"
##
## $glipizide
## [1] "Down" "No" "Steady" "Up"
##
## $glyburide
## [1] "Down" "No" "Steady" "Up"
##
## $tolbutamide
## [1] "No" "Steady"
##
## $pioglitazone
## [1] "Down" "No" "Steady" "Up"
##
## $rosiglitazone
## [1] "Down" "No" "Steady" "Up"
##
## $acarbose
## [1] "Down" "No" "Steady" "Up"
##
## $miglitol
## [1] "Down" "No" "Steady" "Up"
##
## $troglitazone
## [1] "No" "Steady"
##
## $tolazamide
## [1] "No" "Steady" "Up"
##
## $examide
## [1] "No"
##
## $citoglipton
## [1] "No"
##
## $insulin
## [1] "Down" "No" "Steady" "Up"
##
## $glyburide.metformin
## [1] "Down" "No" "Steady" "Up"
##
## $glipizide.metformin
## [1] "No" "Steady"
##
## $glimepiride.pioglitazone
## [1] "No" "Steady"
##
## $metformin.rosiglitazone
## [1] "No" "Steady"
##
## $metformin.pioglitazone
## [1] "No" "Steady"
##
## $change
## [1] "Ch" "No"
##
## $diabetesMed
## [1] "No" "Yes"
##
## $readmitted
## [1] "<30" ">30" "NO"
db.init$weight <- factor(db.init$weight, levels = c("?", "[0-25)", "[25-50)", "[50-75)", "[75-100)", "[100-125)", "[125-150)", "[150-175)", "[175-200)", ">200")) # Reorder the factor levels for weight to represent true ordering
# Need to convert admission_type_id, discharge_disposition_id and admission_source_id each to a factor and add in level names (based on supplementary id_mapping document)
table(db.init$admission_type_id) # Confirmed that all levels (1-8) are present and each have cases
##
## 1 2 3 4 5 6 7 8
## 53990 18480 18869 10 4785 5291 21 320
db.init$admission_type_id <- as.factor(db.init$admission_type_id)
levels(db.init$admission_type_id) <- c('Emergency', 'Urgent', 'Elective', 'Newborn', 'NotAvailable', 'NULL', 'TraumaCenter', 'NotMapped')
table(db.init$discharge_disposition_id) # Categories 21, 26, 29, and 30 are missing when compared to id_mapping doc; missing because 0 cases; will not add level names for them
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 60234 2128 13954 815 1184 12902 623 108 21 6 1642 3 399
## 14 15 16 17 18 19 20 22 23 24 25 27 28
## 372 63 11 14 3691 8 2 1993 412 48 989 5 139
db.init$discharge_disposition_id <- as.factor(db.init$discharge_disposition_id)
levels(db.init$discharge_disposition_id) <- c('DcHome', 'DcAnotherSTHospital', 'DcSNF', 'DcICF', 'DcOtherTypeInpatient', 'DcHomeWithHomeService', 'LeftAMA', 'DcHomeWithIVCare', 'AdmittedInpatientThisHospital', 'NeonateDcOtherHospital', 'Expired', 'StillPatientOrOutpatient', 'HospiceHome', 'HospiceFacility', 'DcWithinSwingBed', 'RefOtherOutpatient', 'RefSameOutpatient', 'NULL', 'ExpiredHome', 'ExpiredFacility', 'DcRehab', 'DcLTCHospital', 'DcMedicaidNursingFacility', 'NotMapped', 'DcFederalHealthFacility','DcPsychiatric')
table(db.init$admission_source_id) # Categories 12, 15, 18, 19, 21, 23, 24, and 26 are missing when compared to id_mapping doc. Note: there is no category 16 in the id_mapping doc, nor is one present in the data
##
## 1 2 3 4 5 6 7 8 9 10 11 13 14
## 29565 1104 187 3187 855 2264 57494 16 125 8 2 1 2
## 17 20 22 25
## 6781 161 12 2
db.init$admission_source_id <- as.factor(db.init$admission_source_id)
levels(db.init$admission_source_id) <- c('PhysRef', 'ClinicRef', 'HMORef', 'TransferFromHospital', 'TransferFromSNF', 'TransferFromOtherHealthFacility', 'ER', 'CourtOrLawEnf', 'NotAvailable', 'TransferFromCriticalAccessHospital', 'NormalDelivery', 'SickBaby', 'ExtramuralBirth', 'NULL', 'NotMapped', 'SepClaim', 'TransferFromAmbSurg')
# No need to check specifically for overlap in levels of diagnoses, as these are all ICD-9-CM codes. Just skimmed the levels to ensure the levels were either ICD codes or '?'
levels(db.init$diag_1)
## [1] "?" "10" "11" "110" "112" "114" "115" "117"
## [9] "131" "133" "135" "136" "141" "142" "143" "145"
## [17] "146" "147" "148" "149" "150" "151" "152" "153"
## [25] "154" "155" "156" "157" "158" "160" "161" "162"
## [33] "163" "164" "170" "171" "172" "173" "174" "175"
## [41] "179" "180" "182" "183" "184" "185" "187" "188"
## [49] "189" "191" "192" "193" "194" "195" "196" "197"
## [57] "198" "199" "200" "201" "202" "203" "204" "205"
## [65] "207" "208" "210" "211" "212" "214" "215" "216"
## [73] "217" "218" "219" "220" "223" "225" "226" "227"
## [81] "228" "229" "23" "230" "233" "235" "236" "237"
## [89] "238" "239" "240" "241" "242" "244" "245" "246"
## [97] "250" "250.01" "250.02" "250.03" "250.1" "250.11" "250.12" "250.13"
## [105] "250.2" "250.21" "250.22" "250.23" "250.3" "250.31" "250.32" "250.33"
## [113] "250.4" "250.41" "250.42" "250.43" "250.5" "250.51" "250.52" "250.53"
## [121] "250.6" "250.7" "250.8" "250.81" "250.82" "250.83" "250.9" "250.91"
## [129] "250.92" "250.93" "251" "252" "253" "255" "261" "262"
## [137] "263" "266" "27" "271" "272" "273" "274" "275"
## [145] "276" "277" "278" "279" "280" "281" "282" "283"
## [153] "284" "285" "286" "287" "288" "289" "290" "291"
## [161] "292" "293" "294" "295" "296" "297" "298" "299"
## [169] "3" "300" "301" "303" "304" "305" "306" "307"
## [177] "308" "309" "31" "310" "311" "312" "314" "318"
## [185] "320" "322" "323" "324" "325" "327" "331" "332"
## [193] "333" "334" "335" "336" "337" "338" "34" "340"
## [201] "341" "342" "344" "345" "346" "347" "348" "349"
## [209] "35" "350" "351" "352" "353" "354" "355" "356"
## [217] "357" "358" "359" "36" "360" "361" "362" "363"
## [225] "365" "366" "368" "369" "370" "372" "373" "374"
## [233] "375" "376" "377" "378" "379" "38" "380" "381"
## [241] "382" "383" "384" "385" "386" "388" "389" "39"
## [249] "391" "394" "395" "396" "397" "398" "401" "402"
## [257] "403" "404" "405" "41" "410" "411" "412" "413"
## [265] "414" "415" "416" "417" "42" "420" "421" "422"
## [273] "423" "424" "425" "426" "427" "428" "429" "430"
## [281] "431" "432" "433" "434" "435" "436" "437" "438"
## [289] "440" "441" "442" "443" "444" "445" "446" "447"
## [297] "448" "451" "452" "453" "454" "455" "456" "457"
## [305] "458" "459" "461" "462" "463" "464" "465" "466"
## [313] "47" "470" "471" "473" "474" "475" "477" "478"
## [321] "48" "480" "481" "482" "483" "485" "486" "487"
## [329] "49" "490" "491" "492" "493" "494" "495" "496"
## [337] "5" "500" "501" "506" "507" "508" "510" "511"
## [345] "512" "513" "514" "515" "516" "518" "519" "52"
## [353] "521" "522" "523" "524" "526" "527" "528" "529"
## [361] "53" "530" "531" "532" "533" "534" "535" "536"
## [369] "537" "54" "540" "541" "542" "543" "550" "551"
## [377] "552" "553" "555" "556" "557" "558" "560" "562"
## [385] "564" "565" "566" "567" "568" "569" "57" "570"
## [393] "571" "572" "573" "574" "575" "576" "577" "578"
## [401] "579" "58" "580" "581" "582" "583" "584" "585"
## [409] "586" "588" "590" "591" "592" "593" "594" "595"
## [417] "596" "598" "599" "600" "601" "602" "603" "604"
## [425] "605" "607" "608" "61" "610" "611" "614" "615"
## [433] "616" "617" "618" "619" "620" "621" "622" "623"
## [441] "625" "626" "627" "632" "633" "634" "637" "640"
## [449] "641" "642" "643" "644" "645" "646" "647" "648"
## [457] "649" "652" "653" "654" "655" "656" "657" "658"
## [465] "659" "66" "660" "661" "663" "664" "665" "669"
## [473] "671" "674" "680" "681" "682" "683" "684" "685"
## [481] "686" "690" "691" "692" "693" "694" "695" "696"
## [489] "698" "7" "70" "700" "703" "704" "705" "706"
## [497] "707" "708" "709" "710" "711" "714" "715" "716"
## [505] "717" "718" "719" "720" "721" "722" "723" "724"
## [513] "725" "726" "727" "728" "729" "730" "731" "732"
## [521] "733" "734" "735" "736" "737" "738" "745" "746"
## [529] "747" "75" "751" "753" "756" "759" "78" "780"
## [537] "781" "782" "783" "784" "785" "786" "787" "788"
## [545] "789" "79" "790" "791" "792" "793" "794" "795"
## [553] "796" "797" "799" "8" "800" "801" "802" "803"
## [561] "804" "805" "806" "807" "808" "810" "812" "813"
## [569] "814" "815" "816" "817" "82" "820" "821" "822"
## [577] "823" "824" "825" "826" "827" "831" "832" "833"
## [585] "834" "835" "836" "837" "838" "839" "84" "840"
## [593] "842" "843" "844" "845" "846" "847" "848" "850"
## [601] "851" "852" "853" "854" "860" "861" "862" "863"
## [609] "864" "865" "866" "867" "868" "870" "871" "873"
## [617] "875" "878" "879" "88" "880" "881" "882" "883"
## [625] "885" "886" "890" "891" "892" "893" "895" "897"
## [633] "9" "903" "904" "906" "911" "913" "914" "915"
## [641] "916" "917" "919" "920" "921" "922" "923" "924"
## [649] "928" "933" "934" "935" "936" "939" "94" "941"
## [657] "942" "944" "945" "952" "955" "957" "958" "959"
## [665] "962" "963" "964" "965" "966" "967" "968" "969"
## [673] "97" "970" "971" "972" "973" "974" "975" "976"
## [681] "977" "98" "980" "982" "983" "986" "987" "988"
## [689] "989" "990" "991" "992" "994" "995" "996" "997"
## [697] "998" "999" "E909" "V07" "V25" "V26" "V43" "V45"
## [705] "V51" "V53" "V54" "V55" "V56" "V57" "V58" "V60"
## [713] "V63" "V66" "V67" "V70" "V71"
levels(db.init$diag_2)
## [1] "?" "11" "110" "111" "112" "114" "115" "117"
## [9] "123" "130" "131" "135" "136" "137" "138" "140"
## [17] "141" "145" "150" "151" "152" "153" "154" "155"
## [25] "156" "157" "162" "163" "164" "171" "172" "173"
## [33] "174" "179" "180" "182" "183" "185" "186" "188"
## [41] "189" "191" "192" "193" "195" "196" "197" "198"
## [49] "199" "200" "201" "202" "203" "204" "205" "208"
## [57] "211" "212" "214" "215" "217" "218" "220" "223"
## [65] "225" "226" "227" "228" "232" "233" "235" "238"
## [73] "239" "240" "241" "242" "244" "245" "246" "250"
## [81] "250.01" "250.02" "250.03" "250.1" "250.11" "250.12" "250.13" "250.2"
## [89] "250.21" "250.22" "250.23" "250.3" "250.31" "250.32" "250.33" "250.4"
## [97] "250.41" "250.42" "250.43" "250.5" "250.51" "250.52" "250.53" "250.6"
## [105] "250.7" "250.8" "250.81" "250.82" "250.83" "250.9" "250.91" "250.92"
## [113] "250.93" "251" "252" "253" "255" "256" "258" "259"
## [121] "260" "261" "262" "263" "266" "268" "269" "27"
## [129] "270" "271" "272" "273" "274" "275" "276" "277"
## [137] "278" "279" "280" "281" "282" "283" "284" "285"
## [145] "286" "287" "288" "289" "290" "291" "292" "293"
## [153] "294" "295" "296" "297" "298" "299" "300" "301"
## [161] "302" "303" "304" "305" "306" "307" "308" "309"
## [169] "31" "310" "311" "312" "314" "316" "317" "318"
## [177] "319" "320" "322" "323" "324" "325" "327" "331"
## [185] "332" "333" "335" "336" "337" "338" "34" "340"
## [193] "341" "342" "343" "344" "345" "346" "347" "348"
## [201] "349" "35" "350" "351" "352" "353" "354" "355"
## [209] "356" "357" "358" "359" "360" "362" "364" "365"
## [217] "366" "368" "369" "372" "373" "374" "376" "377"
## [225] "378" "379" "38" "380" "381" "382" "383" "386"
## [233] "388" "389" "394" "395" "396" "397" "398" "40"
## [241] "401" "402" "403" "404" "405" "41" "410" "411"
## [249] "412" "413" "414" "415" "416" "42" "420" "421"
## [257] "422" "423" "424" "425" "426" "427" "428" "429"
## [265] "430" "431" "432" "433" "434" "435" "436" "437"
## [273] "438" "440" "441" "442" "443" "444" "446" "447"
## [281] "448" "451" "452" "453" "454" "455" "456" "457"
## [289] "458" "459" "46" "460" "461" "462" "463" "464"
## [297] "465" "466" "470" "472" "473" "474" "475" "477"
## [305] "478" "480" "481" "482" "483" "484" "485" "486"
## [313] "487" "490" "491" "492" "493" "494" "495" "496"
## [321] "5" "500" "501" "506" "507" "508" "510" "511"
## [329] "512" "513" "514" "515" "516" "517" "518" "519"
## [337] "52" "520" "521" "522" "523" "524" "527" "528"
## [345] "529" "53" "530" "531" "532" "533" "534" "535"
## [353] "536" "537" "54" "540" "542" "543" "550" "552"
## [361] "553" "555" "556" "557" "558" "560" "562" "564"
## [369] "565" "566" "567" "568" "569" "570" "571" "572"
## [377] "573" "574" "575" "576" "577" "578" "579" "580"
## [385] "581" "583" "584" "585" "586" "588" "590" "591"
## [393] "592" "593" "594" "595" "596" "598" "599" "600"
## [401] "601" "602" "603" "604" "605" "607" "608" "610"
## [409] "611" "614" "615" "616" "617" "618" "619" "620"
## [417] "621" "622" "623" "625" "626" "627" "634" "641"
## [425] "642" "644" "645" "646" "647" "648" "649" "652"
## [433] "654" "656" "658" "659" "66" "661" "663" "664"
## [441] "665" "670" "674" "680" "681" "682" "683" "684"
## [449] "685" "686" "691" "692" "693" "694" "695" "696"
## [457] "698" "7" "70" "701" "702" "703" "704" "705"
## [465] "706" "707" "709" "710" "711" "712" "713" "714"
## [473] "715" "716" "717" "718" "719" "721" "722" "723"
## [481] "724" "725" "726" "727" "728" "729" "730" "731"
## [489] "733" "734" "736" "737" "738" "741" "742" "745"
## [497] "746" "747" "748" "75" "750" "751" "752" "753"
## [505] "754" "755" "756" "758" "759" "78" "780" "781"
## [513] "782" "783" "784" "785" "786" "787" "788" "789"
## [521] "79" "790" "791" "792" "793" "794" "795" "796"
## [529] "797" "799" "8" "800" "801" "802" "805" "806"
## [537] "807" "808" "810" "811" "812" "813" "814" "815"
## [545] "816" "820" "821" "822" "823" "824" "825" "826"
## [553] "831" "832" "833" "836" "837" "840" "842" "843"
## [561] "844" "845" "846" "847" "850" "851" "852" "853"
## [569] "860" "861" "862" "863" "864" "865" "866" "867"
## [577] "868" "869" "870" "871" "872" "873" "879" "88"
## [585] "880" "881" "882" "883" "884" "891" "892" "893"
## [593] "894" "9" "905" "906" "907" "908" "909" "910"
## [601] "911" "912" "913" "915" "916" "917" "918" "919"
## [609] "920" "921" "922" "923" "924" "927" "933" "934"
## [617] "94" "942" "944" "945" "947" "948" "952" "953"
## [625] "955" "958" "959" "96" "962" "963" "965" "967"
## [633] "968" "969" "972" "974" "975" "977" "980" "987"
## [641] "989" "99" "990" "991" "992" "994" "995" "996"
## [649] "997" "998" "999" "E812" "E813" "E814" "E816" "E817"
## [657] "E818" "E819" "E821" "E826" "E829" "E849" "E850" "E853"
## [665] "E854" "E858" "E868" "E870" "E878" "E879" "E880" "E881"
## [673] "E882" "E883" "E884" "E885" "E887" "E888" "E890" "E900"
## [681] "E905" "E906" "E915" "E916" "E917" "E918" "E919" "E924"
## [689] "E927" "E928" "E929" "E930" "E931" "E932" "E933" "E934"
## [697] "E935" "E936" "E937" "E938" "E939" "E941" "E942" "E944"
## [705] "E945" "E947" "E950" "E965" "E968" "E980" "V02" "V03"
## [713] "V08" "V09" "V10" "V11" "V12" "V13" "V14" "V15"
## [721] "V16" "V17" "V18" "V23" "V25" "V42" "V43" "V44"
## [729] "V45" "V46" "V49" "V50" "V53" "V54" "V55" "V57"
## [737] "V58" "V60" "V61" "V62" "V63" "V64" "V65" "V66"
## [745] "V69" "V70" "V72" "V85" "V86"
levels(db.init$diag_3)
## [1] "?" "11" "110" "111" "112" "115" "117" "122"
## [9] "123" "131" "132" "135" "136" "138" "139" "14"
## [17] "141" "146" "148" "150" "151" "152" "153" "154"
## [25] "155" "156" "157" "158" "161" "162" "163" "164"
## [33] "17" "170" "171" "172" "173" "174" "175" "179"
## [41] "180" "182" "183" "185" "186" "188" "189" "191"
## [49] "192" "193" "195" "196" "197" "198" "199" "200"
## [57] "201" "202" "203" "204" "205" "208" "211" "214"
## [65] "215" "216" "217" "218" "220" "223" "225" "226"
## [73] "227" "228" "230" "233" "235" "236" "238" "239"
## [81] "240" "241" "242" "243" "244" "245" "246" "250"
## [89] "250.01" "250.02" "250.03" "250.1" "250.11" "250.12" "250.13" "250.2"
## [97] "250.21" "250.22" "250.23" "250.3" "250.31" "250.4" "250.41" "250.42"
## [105] "250.43" "250.5" "250.51" "250.52" "250.53" "250.6" "250.7" "250.8"
## [113] "250.81" "250.82" "250.83" "250.9" "250.91" "250.92" "250.93" "251"
## [121] "252" "253" "255" "256" "258" "259" "260" "261"
## [129] "262" "263" "265" "266" "268" "27" "270" "271"
## [137] "272" "273" "274" "275" "276" "277" "278" "279"
## [145] "280" "281" "282" "283" "284" "285" "286" "287"
## [153] "288" "289" "290" "291" "292" "293" "294" "295"
## [161] "296" "297" "298" "299" "3" "300" "301" "303"
## [169] "304" "305" "306" "307" "308" "309" "310" "311"
## [177] "312" "313" "314" "315" "317" "318" "319" "323"
## [185] "327" "331" "332" "333" "334" "335" "336" "337"
## [193] "338" "34" "340" "341" "342" "343" "344" "345"
## [201] "346" "347" "348" "349" "35" "350" "351" "353"
## [209] "354" "355" "356" "357" "358" "359" "360" "361"
## [217] "362" "365" "365.44" "366" "368" "369" "370" "372"
## [225] "373" "374" "376" "377" "378" "379" "38" "380"
## [233] "381" "382" "383" "384" "385" "386" "387" "388"
## [241] "389" "391" "394" "395" "396" "397" "398" "401"
## [249] "402" "403" "404" "405" "41" "410" "411" "412"
## [257] "413" "414" "415" "416" "417" "42" "420" "421"
## [265] "423" "424" "425" "426" "427" "428" "429" "430"
## [273] "431" "432" "433" "434" "435" "436" "437" "438"
## [281] "440" "441" "442" "443" "444" "445" "446" "447"
## [289] "448" "451" "452" "453" "454" "455" "456" "457"
## [297] "458" "459" "460" "461" "462" "463" "464" "465"
## [305] "466" "47" "470" "472" "473" "475" "477" "478"
## [313] "480" "481" "482" "483" "484" "485" "486" "487"
## [321] "49" "490" "491" "492" "493" "494" "495" "496"
## [329] "5" "500" "501" "506" "507" "508" "510" "511"
## [337] "512" "514" "515" "516" "517" "518" "519" "521"
## [345] "522" "523" "524" "525" "527" "528" "529" "53"
## [353] "530" "531" "532" "533" "534" "535" "536" "537"
## [361] "538" "54" "540" "542" "543" "550" "552" "553"
## [369] "555" "556" "557" "558" "560" "562" "564" "565"
## [377] "566" "567" "568" "569" "57" "570" "571" "572"
## [385] "573" "574" "575" "576" "577" "578" "579" "580"
## [393] "581" "582" "583" "584" "585" "586" "588" "590"
## [401] "591" "592" "593" "594" "595" "596" "597" "598"
## [409] "599" "600" "601" "602" "603" "604" "605" "607"
## [417] "608" "610" "611" "614" "616" "617" "618" "619"
## [425] "620" "621" "622" "623" "624" "625" "626" "627"
## [433] "641" "642" "643" "644" "646" "647" "648" "649"
## [441] "652" "653" "654" "655" "656" "657" "658" "659"
## [449] "66" "660" "661" "663" "664" "665" "669" "670"
## [457] "671" "674" "680" "681" "682" "684" "685" "686"
## [465] "690" "692" "693" "694" "695" "696" "697" "698"
## [473] "7" "70" "701" "702" "703" "704" "705" "706"
## [481] "707" "708" "709" "710" "711" "712" "713" "714"
## [489] "715" "716" "717" "718" "719" "720" "721" "722"
## [497] "723" "724" "725" "726" "727" "728" "729" "730"
## [505] "731" "732" "733" "734" "735" "736" "737" "738"
## [513] "741" "742" "744" "745" "746" "747" "75" "750"
## [521] "751" "752" "753" "754" "755" "756" "757" "758"
## [529] "759" "78" "780" "781" "782" "783" "784" "785"
## [537] "786" "787" "788" "789" "79" "790" "791" "792"
## [545] "793" "794" "795" "796" "797" "799" "8" "800"
## [553] "801" "802" "805" "807" "808" "810" "811" "812"
## [561] "813" "814" "815" "816" "820" "821" "822" "823"
## [569] "824" "825" "826" "831" "834" "836" "837" "838"
## [577] "840" "841" "842" "844" "845" "847" "848" "850"
## [585] "851" "852" "853" "854" "860" "861" "862" "863"
## [593] "864" "865" "866" "867" "868" "870" "871" "872"
## [601] "873" "875" "876" "877" "879" "88" "880" "881"
## [609] "882" "883" "884" "890" "891" "892" "893" "9"
## [617] "905" "906" "907" "908" "909" "910" "911" "912"
## [625] "913" "915" "916" "917" "918" "919" "920" "921"
## [633] "922" "923" "924" "928" "930" "933" "934" "935"
## [641] "94" "942" "943" "944" "945" "948" "951" "952"
## [649] "953" "955" "956" "958" "959" "962" "965" "966"
## [657] "967" "969" "970" "971" "972" "980" "987" "989"
## [665] "991" "992" "995" "996" "997" "998" "999" "E812"
## [673] "E813" "E815" "E816" "E817" "E818" "E819" "E822" "E825"
## [681] "E826" "E828" "E849" "E850" "E852" "E853" "E854" "E855"
## [689] "E858" "E861" "E864" "E865" "E870" "E876" "E878" "E879"
## [697] "E880" "E881" "E882" "E883" "E884" "E885" "E886" "E887"
## [705] "E888" "E892" "E894" "E900" "E901" "E904" "E905" "E906"
## [713] "E912" "E915" "E916" "E917" "E919" "E920" "E922" "E924"
## [721] "E927" "E928" "E929" "E930" "E931" "E932" "E933" "E934"
## [729] "E935" "E936" "E937" "E938" "E939" "E941" "E942" "E943"
## [737] "E944" "E945" "E946" "E947" "E949" "E950" "E955" "E956"
## [745] "E965" "E966" "E980" "E987" "V01" "V02" "V03" "V06"
## [753] "V07" "V08" "V09" "V10" "V11" "V12" "V13" "V14"
## [761] "V15" "V16" "V17" "V18" "V22" "V23" "V25" "V27"
## [769] "V42" "V43" "V44" "V45" "V46" "V49" "V53" "V54"
## [777] "V55" "V57" "V58" "V60" "V61" "V62" "V63" "V64"
## [785] "V65" "V66" "V70" "V72" "V85" "V86"
All remaining attributes are the correct data types.
If there are multiple encounters for the same patient, we want to remove any after the first one. This is because we want to ensure observations are independent, so that we may run statistical tests on them during analysis and not violate the assumption of independence. We are interested in the the first encounter of each patient, as we are focused on early identification of the risks of readmission.
Additionally, cases where the patient died, or were sent to hospice for end-of-life care should be excluded, as there is no chance of readmission.
length(unique(db.init$encounter_id)) # There are 101766 unique encounters, which is the total number of observations in the dataset
## [1] 101766
length(unique(db.init$patient_nbr)) # There are only 71518 unique patient IDs, indicating that a number of patients have 2 or more encounters
## [1] 71518
db.init <- db.init[order(db.init$encounter_id),] # Sort patient encounters in ascending order; lower numbers indicate earlier encounters
db.init <- db.init[!duplicated(db.init$patient_nbr), ] # Remove rows with duplicated patient IDs; the first instance (with the lower encounter number) will be kept, with any later instances removed
dim(db.init) # Confirmed the resulting data frame has 50 attributes and 71518 observations, as expected
## [1] 71518 50
db.init <- subset(db.init, discharge_disposition_id != 'Expired' & discharge_disposition_id != 'ExpiredHome' & discharge_disposition_id != 'ExpiredFacility' & discharge_disposition_id != 'HospiceHome' & discharge_disposition_id != 'HospiceFacility') # Remove cases where the patient died or was sent to hospice
dim(db.init) # There are now 69973 observations for 50 attributes. Note that in the original research article, they had the same criteria for case removal, but were left with 69984 cases (difference of 11 cases) - unsure at this time as to the reason
## [1] 69973 50
Now that all attributes have the correct data types, let’s review the updated structure.
str(db.init)
## 'data.frame': 69973 obs. of 50 variables:
## $ encounter_id : int 12522 15738 16680 28236 35754 36900 40926 42570 55842 62256 ...
## $ patient_nbr : int 48330783 63555939 42519267 89869032 82637451 77391171 85504905 77586282 84259809 49726791 ...
## $ race : Factor w/ 6 levels "?","AfricanAmerican",..: 4 4 4 2 4 2 4 4 4 2 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 1 1 2 1 2 2 1 2 2 1 ...
## $ age : Factor w/ 10 levels "[0-10)","[10-20)",..: 9 10 5 5 6 7 5 9 7 7 ...
## $ weight : Factor w/ 10 levels "?","[0-25)","[25-50)",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ admission_type_id : Factor w/ 8 levels "Emergency","Urgent",..: 2 3 1 1 2 2 1 1 3 3 ...
## $ discharge_disposition_id: Factor w/ 26 levels "DcHome","DcAnotherSTHospital",..: 1 3 1 1 1 1 3 6 1 1 ...
## $ admission_source_id : Factor w/ 17 levels "PhysRef","ClinicRef",..: 4 4 7 7 2 4 7 7 2 2 ...
## $ time_in_hospital : int 13 12 1 9 3 7 7 10 4 1 ...
## $ payer_code : Factor w/ 18 levels "?","BC","CH",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ medical_specialty : Factor w/ 73 levels "?","AllergyandImmunology",..: 1 20 1 1 1 1 13 13 1 1 ...
## $ num_lab_procedures : int 68 33 51 47 31 62 60 55 70 49 ...
## $ num_procedures : int 2 3 0 2 6 0 0 1 1 5 ...
## $ num_medications : int 28 18 8 17 16 11 15 31 21 2 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 1 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_1 : Factor w/ 717 levels "?","10","11",..: 254 284 56 122 265 28 278 278 265 350 ...
## $ diag_2 : Factor w/ 749 levels "?","11","110",..: 262 48 26 243 248 147 99 248 248 650 ...
## $ diag_3 : Factor w/ 790 levels "?","11","110",..: 231 319 88 668 88 53 110 269 772 432 ...
## $ number_diagnoses : int 8 8 5 9 9 7 8 8 7 8 ...
## $ max_glu_serum : Factor w/ 4 levels ">200",">300",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ A1Cresult : Factor w/ 4 levels ">7",">8","None",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ metformin : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 3 2 ...
## $ repaglinide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 4 2 2 2 ...
## $ nateglinide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ chlorpropamide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ glimepiride : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 3 2 ...
## $ acetohexamide : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ glipizide : Factor w/ 4 levels "Down","No","Steady",..: 3 2 3 2 2 2 2 2 2 2 ...
## $ glyburide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 4 2 2 2 2 ...
## $ tolbutamide : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ pioglitazone : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ rosiglitazone : Factor w/ 4 levels "Down","No","Steady",..: 2 3 2 2 2 2 2 2 2 2 ...
## $ acarbose : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ miglitol : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ troglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ tolazamide : Factor w/ 3 levels "No","Steady",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ examide : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
## $ citoglipton : Factor w/ 1 level "No": 1 1 1 1 1 1 1 1 1 1 ...
## $ insulin : Factor w/ 4 levels "Down","No","Steady",..: 3 3 3 3 3 3 1 3 3 3 ...
## $ glyburide.metformin : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ glipizide.metformin : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ glimepiride.pioglitazone: Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ metformin.rosiglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ metformin.pioglitazone : Factor w/ 2 levels "No","Steady": 1 1 1 1 1 1 1 1 1 1 ...
## $ change : Factor w/ 2 levels "Ch","No": 1 1 1 2 2 1 1 2 1 2 ...
## $ diabetesMed : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ readmitted : Factor w/ 3 levels "<30",">30","NO": 3 3 3 2 2 1 1 3 3 2 ...
First we will create a custom function to calculate Tukey fences so that we may identify outliers.
fences <- function(x){
y1 <- unname(summary(x)[2]) - 1.5*(IQR(x))
y2 <- unname(summary(x)[5]) + 1.5*(IQR(x))
z <- c(y1, y2)
return(z)
}
Then we will go through each numeric attribute individually to review the summary statistics, distribution, and outliers. We already know there are no missing values for the numeric attributes.
Encounter ID and Patient Number
length(unique(db.init$patient_nbr)) # Now that the number of unique patient ids matches the number of total observations (69973), there is no further need for the patient number or encounter id attributes. They are not relevant to any analysis, and will be removed for model creation
## [1] 69973
Time in Hospital
summary(db.init$time_in_hospital) # Ranges from 1-14, skewed to the right with a median of 3, mean of 4.273
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.273 6.000 14.000
sd(db.init$time_in_hospital) # Standard deviation of 2.934
## [1] 2.933924
boxplot(db.init$time_in_hospital, horizontal = T, col = '#83afe6', xlab = 'Time in Hospital') # Visible outliers above upper fence
fences(db.init$time_in_hospital) # Lower fence is -4 (below the min), and upper fence is 12
## [1] -4 12
length(subset(db.init$time_in_hospital, db.init$time_in_hospital > 12)) # 1403 records have a length of stay higher than the upper fence
## [1] 1403
hist.time_in_hospital <- ggplot(db.init, aes(time_in_hospital)) + geom_histogram(binwidth = 1, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'Time in Hospital', y = 'Count') + theme(axis.title.y = element_blank())
hist.time_in_hospital # Distribution is visibly skewed to the right
Number of Lab Procedures
summary(db.init$num_lab_procedures) # Ranges from 1-132, with a median of 44, mean of 42.88
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 31.00 44.00 42.88 57.00 132.00
sd(db.init$num_lab_procedures) # Standard deviation of 19.895
## [1] 19.89453
boxplot(db.init$num_lab_procedures, horizontal = T, col = '#83afe6', xlab = 'No. Lab Procedures') # Visible outliers above upper fence
fences(db.init$num_lab_procedures) # Lower fence is -8 (below the min), and upper fence is 96
## [1] -8 96
length(subset(db.init$num_lab_procedures, db.init$num_lab_procedures > 96)) # 98 records have a number of lab procedures higher than the upper fence
## [1] 98
hist.num_lab_procedures <- ggplot(db.init, aes(num_lab_procedures)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Lab Procedures', y = 'Count') + theme(axis.title.y = element_blank())
hist.num_lab_procedures # Distribution is unimodal except for large number of values at 1, with outliers visible to the right
length(subset(db.init$num_lab_procedures, db.init$num_lab_procedures == 1)) # Of note, 2254 patients had only one lab procedure done
## [1] 2254
Number of Procedures
summary(db.init$num_procedures) # Ranges from 0-6, with a median of 1, mean of 1.426
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.426 2.000 6.000
sd(db.init$num_procedures) # Standard deviation of 1.757
## [1] 1.757131
boxplot(db.init$num_procedures, horizontal = T, col = '#83afe6', xlab = 'No. Procedures') # Visible outlier(s) above upper fence
fences(db.init$num_procedures) # Lower fence is -3 (below the min), and upper fence is 5
## [1] -3 5
length(subset(db.init$num_procedures, db.init$num_procedures > 5)) # 3845 records have a number of lab procedures higher than the upper fence. However, the only value above the upper fence is 6, so these are all patients who had 6 lab procedures done
## [1] 3845
hist.num_procedures <- ggplot(db.init, aes(num_procedures)) + geom_histogram(binwidth = 1, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Procedures', y = 'Count') + theme(axis.title.y = element_blank())
hist.num_procedures # Distribution is skewed right
Number of Medications
summary(db.init$num_medications) # Ranges from 1-81, with a median of 14, mean of 15.67
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 10.00 14.00 15.67 20.00 81.00
sd(db.init$num_medications) # Standard deviation of 8.287
## [1] 8.287246
boxplot(db.init$num_medications, horizontal = T, col = '#83afe6', xlab = 'No. Medications') # Many visible outliers above upper fence
fences(db.init$num_medications) # Lower fence is -5 (below the min), and upper fence is 35
## [1] -5 35
length(subset(db.init$num_medications, db.init$num_medications > 35)) # 1867 records have a number of medications higher than the upper fence
## [1] 1867
hist.num_medications <- ggplot(db.init, aes(num_medications)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Medications', y = 'Count') + theme(axis.title.y = element_blank())
hist.num_medications # Distribution is unimodal and skewed slightly right
Number of Outpatient Visits
summary(db.init$number_outpatient) # Ranges from 0-42, with a median of 0, mean of 0.2795
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2795 0.0000 42.0000
sd(db.init$number_outpatient) # Standard deviation of 1.064
## [1] 1.064035
boxplot(db.init$number_outpatient, horizontal = T, col = '#83afe6', xlab = 'No. Outpatient Visits') # Many visible outliers above upper fence
fences(db.init$number_outpatient) # Lower fence is 0, and upper fence is 0
## [1] 0 0
length(subset(db.init$number_outpatient, db.init$number_outpatient != 0)) # 9119 records have a number of outpatient visits higher than the upper fence. I.e., had more than 0 outpatient visits in the last year
## [1] 9119
hist.number_outpatient <- ggplot(db.init, aes(number_outpatient)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Outpatient Visits', y = 'Count') + theme(axis.title.y = element_blank())
hist.number_outpatient # Most cases are zero
table(db.init$number_outpatient) # The vast majority of cases (60854) are zero, and the rest of the data is skewed right
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 60854 4779 1981 1096 574 278 122 72 56 36 28 18 13
## 13 14 15 16 17 18 19 20 21 22 23 24 25
## 13 10 10 7 4 2 1 3 1 2 1 2 1
## 26 27 29 33 35 36 42
## 1 2 1 2 1 1 1
Number of Emergency Visits
summary(db.init$number_emergency) # Ranges from 0-42, with a median of 0, mean of 0.1039
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1039 0.0000 42.0000
sd(db.init$number_emergency) # Standard deviation of 0.512
## [1] 0.5118705
boxplot(db.init$number_emergency, horizontal = T, col = '#83afe6', xlab = 'No. Emergency Visits') # Many visible outliers above upper fence
fences(db.init$number_emergency) # Lower fence is 0, and upper fence is 0
## [1] 0 0
length(subset(db.init$number_emergency, db.init$number_emergency != 0)) # 5100 records have a number of emergency visits higher than the upper fence. I.e., had more than 0 emergency visits in the last year
## [1] 5100
hist.number_emergency <- ggplot(db.init, aes(number_emergency)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Emergency Visits', y = 'Count') + theme(axis.title.y = element_blank())
hist.number_emergency # Most cases are zero
table(db.init$number_emergency) # The vast majority of cases (64873) are zero, and the rest of the data is skewed right
##
## 0 1 2 3 4 5 6 7 8 9 10 11 13
## 64873 3882 789 242 95 32 26 8 9 4 5 2 1
## 16 20 25 37 42
## 1 1 1 1 1
Number of Inpatient Visits
summary(db.init$number_inpatient) # Ranges from 0-12, with a median of 0, mean of 0.1763
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1763 0.0000 12.0000
sd(db.init$number_inpatient) # Standard deviation of 0.602
## [1] 0.6016572
boxplot(db.init$number_inpatient, horizontal = T, col = '#83afe6', xlab = 'No. Inpatient Visits') # Visible outliers above upper fence
fences(db.init$number_inpatient) # Lower fence is 0, and upper fence is 0
## [1] 0 0
length(subset(db.init$number_inpatient, db.init$number_inpatient != 0)) # 8191 records have a number of inpatient visits higher than the upper fence. I.e., had more than 0 inpatient visits in the last year
## [1] 8191
hist.number_inpatient <- ggplot(db.init, aes(number_inpatient)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Inpatient Visits') + theme(axis.title.y = element_blank()) + theme(axis.title.y = element_blank())
hist.number_inpatient # Most casesa are zero
table(db.init$number_inpatient) # The vast majority of cases (61782) are zero, and the rest of the data is skewed right
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 61782 5794 1501 463 228 102 55 19 13 7 5 2 2
Number of Diagnoses
summary(db.init$number_diagnoses) # Ranges from 1-16, with a median of 8, mean of 7.224
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 6.000 8.000 7.224 9.000 16.000
sd(db.init$number_diagnoses) # Standard deviation of 2.001
## [1] 2.001354
boxplot(db.init$number_diagnoses, horizontal = T, col = '#83afe6', xlab = 'No. Diagnoses') # Visible outliers above upper fence and below lower fence
fences(db.init$number_diagnoses) # Lower fence is 1.5, and upper fence is 13.5
## [1] 1.5 13.5
length(subset(db.init$number_diagnoses, db.init$number_diagnoses < 1.5)) # 193 cases are below the lower fence. I.e., 193 people had 1 diagnosis, as 1 is the min
## [1] 193
length(subset(db.init$number_diagnoses, db.init$number_diagnoses > 13.5)) # 42 records have a number of diagnoses higher than the upper fence
## [1] 42
hist.number_diagnoses <- ggplot(db.init, aes(number_diagnoses)) + geom_histogram(binwidth = 2, color='black', fill = '#83afe6') + theme_bw() + labs(x = 'No. Diagnoses', y = 'Count') + theme(axis.title.y = element_blank())
hist.number_diagnoses # Skewed left from 1-9
table(db.init$number_diagnoses) # From diagnoses 1-9, the data is skewed left. There are very few cases 10 and above. Since there is no biological reason for this, this finding suggests to me that there is some reason coding for more than 9 diagnoses may be difficult in the system, so many practitioners don't bother for an inpatient visit
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 193 881 2357 4442 8881 7566 7485 7391 30704 9 6 6 10
## 14 15 16
## 5 7 30
length(subset(db.init$number_diagnoses, db.init$number_diagnoses > 9)) # 73 cases have more than 9 diagnoses
## [1] 73
The following are grouped visualizations of the numeric attributes above, specifically the histograms and boxplots.
grid.arrange(hist.time_in_hospital, hist.num_lab_procedures, hist.num_procedures, hist.num_medications, hist.number_outpatient, hist.number_emergency, hist.number_inpatient, hist.number_diagnoses, nrow = 4) # Histograms together
par(mfrow=c(4,2)) # Boxplots together
boxplot(db.init$time_in_hospital, xlab = 'Time in Hospital', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$num_lab_procedures, xlab = 'No. Lab Procedures', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$num_procedures, xlab = 'No. Procedures', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$num_medications, xlab = 'No. Medications', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$number_outpatient, xlab = 'No. Outpatient Visits', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$number_emergency, xlab = 'No. Emergency Visits', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$number_inpatient, xlab = 'No. Inpatient Visits', las=1, horizontal = T, col = '#83afe6')
boxplot(db.init$number_diagnoses, xlab = 'No. Diagnoses', las=1, horizontal = T, col = '#83afe6')
Now we’ll identify the overall number of outliers for the numeric attributes.
nrow(subset(db.init, db.init$time_in_hospital > 12 | db.init$num_lab_procedures > 96 | db.init$num_procedures > 5 | db.init$num_medications > 35 | db.init$number_outpatient != 0 | db.init$number_emergency != 0 | db.init$number_inpatient != 0 | db.init$number_diagnoses < 1.5 | db.init$number_diagnoses > 13.5)) # There are 23159 rows in which at least one outlier is present
## [1] 23159
23159/69973 # 33.1% of records in the dataset contain one or more outliers; these will be kept as they may all represent real cases based on their values. Additionally, neural networks are somewhat robust to outliers
## [1] 0.3309705
We will transfer the numeric attributes kept to a new clean dataframe (db.expl) for further exploratory analysis.
db.expl <- db.init[,c(10, 13:18, 22)]
We will go through each ordinal attribute and review frequencies, relative frequencies, missing values, and histograms as appropriate.
Age
table(db.init$age) # Most of cases appear to be grouped around ages 50-90 with the 70-80 group being the single largest (15684 cases); there are no missing values
##
## [0-10) [10-20) [20-30) [30-40) [40-50) [50-60) [60-70) [70-80)
## 153 534 1121 2692 6828 12349 15684 17750
## [80-90) [90-100)
## 11102 1760
prop.table(table(db.init$age)) # Largest number of cases is 70-80 at 25.4%
##
## [0-10) [10-20) [20-30) [30-40) [40-50) [50-60)
## 0.002186558 0.007631515 0.016020465 0.038471982 0.097580495 0.176482357
## [60-70) [70-80) [80-90) [90-100)
## 0.224143598 0.253669272 0.158661198 0.025152559
ggplot(db.init, aes(age)) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Age', y = 'Count') # Distribution is unimodal and skewed slightly left
db.expl$age <- db.init$age # Add variable to exploration data frame; no changes needed
Weight
sort(prop.table(table(db.init$weight))) # 96.0% of the data is missing. Attribute will be removed for analysis and modeling
##
## >200 [175-200) [150-175) [0-25) [25-50) [125-150)
## 4.287368e-05 1.286210e-04 4.573193e-04 6.573964e-04 1.171881e-03 1.843568e-03
## [100-125) [50-75) [75-100) ?
## 8.003087e-03 1.078988e-02 1.674932e-02 9.601561e-01
Max Glucose Serum
sort(prop.table(table(db.init$max_glu_serum))) # 95.2% of the time, the test was not taken. This attribute will be removed from analysis due to low variance
##
## >300 >200 Norm None
## 0.01017535 0.01337659 0.02429509 0.95215297
Any attribute with one category spanning more than 95% of the data will be removed from analysis and modeling due to low variance.
A1C Result
sort(prop.table(table(db.init$A1Cresult))) # Test not done 81.6% of the time
##
## >7 Norm >8 None
## 0.04094436 0.05346348 0.08916296 0.81642919
db.expl$A1Cresult <- droplevels(fct_collapse(db.init$A1Cresult, '>7' = c('>7', '>8'))) # Put both abnormal results (>7, >8) in the same category, as they are small samples, and I'm interested in an abnormal VS normal result; there's no medical reason for them to be considered separately. These categories will be compared to the test not being done as its own category. Add updated column to db.expl dataframe for exploration
table(db.expl$A1Cresult)
##
## >7 None Norm
## 9104 57128 3741
sort(prop.table(table(db.expl$A1Cresult)))
##
## Norm >7 None
## 0.05346348 0.13010733 0.81642919
A1C Result will be considered a categorical variable going forward, comparing the categories of the test not being done (‘None’), the test done with a normal result (‘Norm’), and the test done with an abnormal result (‘>7’).
We will go through each categorical attribute and review frequencies and relative frequencies. Any attribute with a small number of missing values will have the missing values imputed as the majority class. Categories will be condensed based on domain knowledge, and then any with under 5% of cases will be grouped into an ‘Other’ category. Attributes to be kept will be updated in the db.expl dataframe for further exploratory analysis.
Race
sort(prop.table(table(db.init$race))) # The highest occurring racial group was Caucasian and 74.7%, followed by African American at 18% and then Unknown ('?') at 2.7%
##
## Asian Other Hispanic ? AfricanAmerican
## 0.006974119 0.016434911 0.021436840 0.027410573 0.180426736
## Caucasian
## 0.747316822
db.expl$race <- droplevels(fct_collapse(db.init$race, 'Caucasian' = c('Caucasian', '?'))) # Impute unknown ('?') category as the majority class (Caucasian); will drop any empty levels here too, and add updated column to db.expl dataframe for exploration
table(db.expl$race) # Review frequencies now that Unknown was imputed
##
## Caucasian AfricanAmerican Asian Hispanic Other
## 54210 12625 488 1500 1150
sort(prop.table(table(db.expl$race))) # Now Caucasian represents 77.5% of cases
##
## Asian Other Hispanic AfricanAmerican Caucasian
## 0.006974119 0.016434911 0.021436840 0.180426736 0.774727395
Gender
table(db.init$gender) # Only 3 Unknown/Invalid cases
##
## Female Male Unknown/Invalid
## 37229 32741 3
db.expl$gender <- droplevels(fct_collapse(db.init$gender, 'Female' = c('Female', 'Unknown/Invalid'))) # Impute Unknown/Invalid category as the majority class (Female)
table(db.expl$gender)
##
## Female Male
## 37232 32741
prop.table(table(db.expl$gender)) #53.2% female, 46.8% male
##
## Female Male
## 0.5320909 0.4679091
Admission Type ID
sort(prop.table(table(db.init$admission_type_id))) # The majority came in through emergency (50.6%); Collectively 11.3% of cases are Not Mapped, Not Available or NULL. These will be assigned to the majority class (Emergency)
##
## Newborn TraumaCenter NotMapped NotAvailable NULL Urgent
## 0.0001286210 0.0002572421 0.0041587469 0.0441027253 0.0645391794 0.1829562831
## Elective Emergency
## 0.1970045589 0.5068526432
db.expl$admission_type_id <- droplevels(fct_collapse(db.init$admission_type_id, 'Emergency' = c('Emergency', 'NULL', 'NotAvailable', 'NotMapped'), 'Other' = c('TraumaCenter', 'Newborn'))) # Assign missing values to Emergency; add any categories under 5% of cases to "Other"
table(db.expl$admission_type_id)
##
## Emergency Urgent Elective Other
## 43359 12802 13785 27
sort(prop.table(table(db.expl$admission_type_id)))
##
## Other Urgent Elective Emergency
## 0.0003858631 0.1829562831 0.1970045589 0.6196532948
Discharge Disposition ID
table(db.init$discharge_disposition_id) # There are 26 levels, most of which have very few cases
##
## DcHome DcAnotherSTHospital
## 44317 1539
## DcSNF DcICF
## 8784 541
## DcOtherTypeInpatient DcHomeWithHomeService
## 913 8289
## LeftAMA DcHomeWithIVCare
## 409 73
## AdmittedInpatientThisHospital NeonateDcOtherHospital
## 9 6
## Expired StillPatientOrOutpatient
## 0 2
## HospiceHome HospiceFacility
## 0 0
## DcWithinSwingBed RefOtherOutpatient
## 40 3
## RefSameOutpatient NULL
## 8 2474
## ExpiredHome ExpiredFacility
## 0 0
## DcRehab DcLTCHospital
## 1410 260
## DcMedicaidNursingFacility NotMapped
## 25 778
## DcFederalHealthFacility DcPsychiatric
## 3 90
sort(prop.table(table(db.init$discharge_disposition_id))) #4.6% of cases unknown between Not Mapped and NULL; will assign to majority class (DcHome)
##
## Expired HospiceHome
## 0.000000e+00 0.000000e+00
## HospiceFacility ExpiredHome
## 0.000000e+00 0.000000e+00
## ExpiredFacility StillPatientOrOutpatient
## 0.000000e+00 2.858245e-05
## RefOtherOutpatient DcFederalHealthFacility
## 4.287368e-05 4.287368e-05
## NeonateDcOtherHospital RefSameOutpatient
## 8.574736e-05 1.143298e-04
## AdmittedInpatientThisHospital DcMedicaidNursingFacility
## 1.286210e-04 3.572807e-04
## DcWithinSwingBed DcHomeWithIVCare
## 5.716491e-04 1.043260e-03
## DcPsychiatric DcLTCHospital
## 1.286210e-03 3.715719e-03
## LeftAMA DcICF
## 5.845112e-03 7.731554e-03
## NotMapped DcOtherTypeInpatient
## 1.111857e-02 1.304789e-02
## DcRehab DcAnotherSTHospital
## 2.015063e-02 2.199420e-02
## NULL DcHomeWithHomeService
## 3.535649e-02 1.184600e-01
## DcSNF DcHome
## 1.255341e-01 6.333443e-01
db.expl$discharge_disposition_id <- droplevels(fct_collapse(db.init$discharge_disposition_id,
'DcHome' = c('DcHome', 'NULL', 'NotMapped'),
'DcHomeWithHomeService' = c('DcHomeWithHomeService', 'DcHomeWithIVCare'),
'DcOtherFacility' = c('DcAnotherSTHospital', 'DcICF', 'DcOtherTypeInpatient', 'NeonateDcOtherHospital', 'DcRehab', 'DcLTCHospital', 'DcMedicaidNursingFacility', 'DcFederalHealthFacility', 'DcPsychiatric'),
'Other' = c('AdmittedInpatientThisHospital', 'StillPatientOrOutpatient', 'DcWithinSwingBed', 'RefOtherOutpatient', 'RefSameOutpatient', 'LeftAMA'))) # Drop empty levels, and collapse categories based on what logically fits together; add any remaining categories under 5% of cases to "Other"
table(db.expl$discharge_disposition_id)
##
## DcHome DcOtherFacility DcSNF
## 47569 4787 8784
## DcHomeWithHomeService Other
## 8362 471
sort(prop.table(table(db.expl$discharge_disposition_id))) #DcHome now has 68.0% of cases
##
## Other DcOtherFacility DcHomeWithHomeService
## 0.006731168 0.068412102 0.119503237
## DcSNF DcHome
## 0.125534135 0.679819359
Admission Source ID
table(db.init$admission_source_id) # There are 17 levels, most of which have very few cases
##
## PhysRef ClinicRef
## 21746 908
## HMORef TransferFromHospital
## 136 2530
## TransferFromSNF TransferFromOtherHealthFacility
## 512 1785
## ER CourtOrLawEnf
## 37260 11
## NotAvailable TransferFromCriticalAccessHospital
## 95 7
## NormalDelivery SickBaby
## 1 1
## ExtramuralBirth NULL
## 2 4820
## NotMapped SepClaim
## 153 4
## TransferFromAmbSurg
## 2
sort(prop.table(table(db.init$admission_source_id))) #7.1% of cases unknown between Not Mapped, Not Available, and NULL; will assign to majority class (ER)
##
## NormalDelivery SickBaby
## 1.429123e-05 1.429123e-05
## ExtramuralBirth TransferFromAmbSurg
## 2.858245e-05 2.858245e-05
## SepClaim TransferFromCriticalAccessHospital
## 5.716491e-05 1.000386e-04
## CourtOrLawEnf NotAvailable
## 1.572035e-04 1.357667e-03
## HMORef NotMapped
## 1.943607e-03 2.186558e-03
## TransferFromSNF ClinicRef
## 7.317108e-03 1.297643e-02
## TransferFromOtherHealthFacility TransferFromHospital
## 2.550984e-02 3.615680e-02
## NULL PhysRef
## 6.888371e-02 3.107770e-01
## ER
## 5.324911e-01
db.expl$admission_source_id <- droplevels(fct_collapse(db.init$admission_source_id,
'ER' = c('ER', 'NotMapped', 'NotAvailable', 'NULL'),
'TransferExtFacility' = c('TransferFromHospital', 'TransferFromOtherHealthFacility', 'TransferFromSNF', 'ClinicRef', 'HMORef', 'TransferFromCriticalAccessHospital', 'TransferFromAmbSurg'),
'Other' = c('CourtOrLawEnf', 'NormalDelivery', 'SickBaby', 'ExtramuralBirth', 'SepClaim'))) # Drop empty levels, and collapse categories based on what logically fits together; add any remaining categories under 5% of cases to "Other"
table(db.expl$admission_source_id)
##
## PhysRef TransferExtFacility ER Other
## 21746 5880 42328 19
sort(prop.table(table(db.expl$admission_source_id)))
##
## Other TransferExtFacility PhysRef ER
## 0.0002715333 0.0840324125 0.3107770140 0.6049190402
Payer Code
levels(db.init$payer_code) # 18 Levels; unsure of what codes mean as most are not standardized and the original researchers gave no indication
## [1] "?" "BC" "CH" "CM" "CP" "DM" "FR" "HM" "MC" "MD" "MP" "OG" "OT" "PO" "SI"
## [16] "SP" "UN" "WC"
sort(prop.table(table(db.init$payer_code))) # Given the attribute is 43.5% missing, and we are unable to determine with any certainty the meanings behind categories, the attribute will not be used for analysis or modeling
##
## FR MP SI OT CH WC
## 1.429123e-05 4.573193e-04 5.287754e-04 8.860561e-04 1.614909e-03 1.672074e-03
## DM PO OG CM UN CP
## 5.316336e-03 6.531091e-03 9.246424e-03 1.850714e-02 2.651023e-02 2.771069e-02
## MD SP BC HM MC ?
## 3.094051e-02 4.720392e-02 4.854730e-02 5.693625e-02 2.827090e-01 4.346677e-01
Medical Specialty
levels(db.init$medical_specialty) # 73 categories
## [1] "?"
## [2] "AllergyandImmunology"
## [3] "Anesthesiology"
## [4] "Anesthesiology-Pediatric"
## [5] "Cardiology"
## [6] "Cardiology-Pediatric"
## [7] "DCPTEAM"
## [8] "Dentistry"
## [9] "Dermatology"
## [10] "Emergency/Trauma"
## [11] "Endocrinology"
## [12] "Endocrinology-Metabolism"
## [13] "Family/GeneralPractice"
## [14] "Gastroenterology"
## [15] "Gynecology"
## [16] "Hematology"
## [17] "Hematology/Oncology"
## [18] "Hospitalist"
## [19] "InfectiousDiseases"
## [20] "InternalMedicine"
## [21] "Nephrology"
## [22] "Neurology"
## [23] "Neurophysiology"
## [24] "Obsterics&Gynecology-GynecologicOnco"
## [25] "Obstetrics"
## [26] "ObstetricsandGynecology"
## [27] "Oncology"
## [28] "Ophthalmology"
## [29] "Orthopedics"
## [30] "Orthopedics-Reconstructive"
## [31] "Osteopath"
## [32] "Otolaryngology"
## [33] "OutreachServices"
## [34] "Pathology"
## [35] "Pediatrics"
## [36] "Pediatrics-AllergyandImmunology"
## [37] "Pediatrics-CriticalCare"
## [38] "Pediatrics-EmergencyMedicine"
## [39] "Pediatrics-Endocrinology"
## [40] "Pediatrics-Hematology-Oncology"
## [41] "Pediatrics-InfectiousDiseases"
## [42] "Pediatrics-Neurology"
## [43] "Pediatrics-Pulmonology"
## [44] "Perinatology"
## [45] "PhysicalMedicineandRehabilitation"
## [46] "PhysicianNotFound"
## [47] "Podiatry"
## [48] "Proctology"
## [49] "Psychiatry"
## [50] "Psychiatry-Addictive"
## [51] "Psychiatry-Child/Adolescent"
## [52] "Psychology"
## [53] "Pulmonology"
## [54] "Radiologist"
## [55] "Radiology"
## [56] "Resident"
## [57] "Rheumatology"
## [58] "Speech"
## [59] "SportsMedicine"
## [60] "Surgeon"
## [61] "Surgery-Cardiovascular"
## [62] "Surgery-Cardiovascular/Thoracic"
## [63] "Surgery-Colon&Rectal"
## [64] "Surgery-General"
## [65] "Surgery-Maxillofacial"
## [66] "Surgery-Neuro"
## [67] "Surgery-Pediatric"
## [68] "Surgery-Plastic"
## [69] "Surgery-PlasticwithinHeadandNeck"
## [70] "Surgery-Thoracic"
## [71] "Surgery-Vascular"
## [72] "SurgicalSpecialty"
## [73] "Urology"
sort(prop.table(table(db.init$medical_specialty))) # 48.1% missing ('?'), and may be related to primary diagnosis (another attribute we have); attribute to be removed for analysis and modeling
##
## Pediatrics-AllergyandImmunology Pediatrics-InfectiousDiseases
## 0.000000e+00 0.000000e+00
## Dermatology Neurophysiology
## 1.429123e-05 1.429123e-05
## Perinatology Proctology
## 1.429123e-05 1.429123e-05
## Psychiatry-Addictive Resident
## 1.429123e-05 1.429123e-05
## Speech SportsMedicine
## 1.429123e-05 1.429123e-05
## Surgery-PlasticwithinHeadandNeck Pediatrics-EmergencyMedicine
## 1.429123e-05 4.287368e-05
## Pediatrics-Hematology-Oncology DCPTEAM
## 4.287368e-05 5.716491e-05
## Dentistry AllergyandImmunology
## 5.716491e-05 8.574736e-05
## Pathology Pediatrics-Pulmonology
## 8.574736e-05 8.574736e-05
## Psychiatry-Child/Adolescent Surgery-Pediatric
## 8.574736e-05 8.574736e-05
## Anesthesiology Cardiology-Pediatric
## 1.000386e-04 1.000386e-04
## Endocrinology-Metabolism Pediatrics-Neurology
## 1.000386e-04 1.000386e-04
## PhysicianNotFound Surgery-Maxillofacial
## 1.000386e-04 1.143298e-04
## OutreachServices Surgery-Colon&Rectal
## 1.286210e-04 1.286210e-04
## Rheumatology Anesthesiology-Pediatric
## 1.429123e-04 1.857859e-04
## Obstetrics Obsterics&Gynecology-GynecologicOnco
## 2.429509e-04 2.572421e-04
## SurgicalSpecialty InfectiousDiseases
## 3.715719e-04 4.144456e-04
## Surgery-Plastic Hematology
## 4.144456e-04 4.430280e-04
## Ophthalmology Hospitalist
## 5.001929e-04 5.144842e-04
## Osteopath Radiology
## 5.287754e-04 5.430666e-04
## Surgeon Psychology
## 5.716491e-04 7.574350e-04
## Gynecology Podiatry
## 7.717262e-04 9.003473e-04
## Pediatrics-CriticalCare Surgery-Cardiovascular
## 1.043260e-03 1.214754e-03
## Surgery-Thoracic Endocrinology
## 1.300502e-03 1.386249e-03
## Hematology/Oncology Otolaryngology
## 1.557744e-03 1.572035e-03
## Pediatrics-Endocrinology Neurology
## 2.100810e-03 2.386635e-03
## PhysicalMedicineandRehabilitation Pediatrics
## 2.772498e-03 2.786789e-03
## Oncology Surgery-Vascular
## 2.929701e-03 5.130550e-03
## Gastroenterology Surgery-Neuro
## 5.473540e-03 5.773656e-03
## Surgery-Cardiovascular/Thoracic Urology
## 6.974119e-03 7.574350e-03
## ObstetricsandGynecology Psychiatry
## 8.474697e-03 8.760522e-03
## Pulmonology Nephrology
## 9.103511e-03 1.139011e-02
## Radiologist Orthopedics-Reconstructive
## 1.173310e-02 1.487717e-02
## Orthopedics Surgery-General
## 1.612050e-02 3.151215e-02
## Cardiology Emergency/Trauma
## 6.012319e-02 6.278136e-02
## Family/GeneralPractice InternalMedicine
## 7.114173e-02 1.520729e-01
## ?
## 4.807426e-01
Diagnosis 1
length(levels(db.init$diag_1)) # 717 distinct primary diagnoses; a bit cumbersome to analyze
## [1] 717
sort(table(db.init$diag_1)) # Code 414 has the highest number of cases (5209), which is "Other forms of chronic ischemic heart disease". Next is code 428 at 3876 records, which is Heart Failure. 3rd highest is code 786 at 3040 records, which is "Symptoms involving respiratory system and other chest symptoms". Diabetes mellitus appears to have several thousand cases as well, but we can't easily tell as it's broken up into about 10 categories (250, 250.xx)
##
## 23 271 279 372 373 389 506 523 543 58 698
## 0 0 0 0 0 0 0 0 0 0 0
## 731 804 827 895 919 97 973 974 988 V07 V66
## 0 0 0 0 0 0 0 0 0 0 0
## 10 114 131 133 143 145 148 160 207 216 217
## 1 1 1 1 1 1 1 1 1 1 1
## 219 229 250.51 262 27 299 314 318 325 334 347
## 1 1 1 1 1 1 1 1 1 1 1
## 363 365 366 375 381 391 412 448 471 477 500
## 1 1 1 1 1 1 1 1 1 1 1
## 52 57 580 605 61 615 634 637 640 649 653
## 1 1 1 1 1 1 1 1 1 1 1
## 669 671 674 684 690 691 700 703 704 720 75
## 1 1 1 1 1 1 1 1 1 1 1
## 791 803 817 826 832 833 834 837 838 839 84
## 1 1 1 1 1 1 1 1 1 1 1
## 842 848 870 878 885 903 904 906 911 915 917
## 1 1 1 1 1 1 1 1 1 1 1
## 923 939 944 955 957 963 971 975 976 98 980
## 1 1 1 1 1 1 1 1 1 1 1
## 982 994 E909 V25 V43 V51 V60 V67 V70 110 115
## 1 1 1 1 1 1 1 1 1 2 2
## 146 147 149 164 170 173 187 194 195 208 236
## 2 2 2 2 2 2 2 2 2 2 2
## 246 250.53 250.91 308 31 322 324 336 352 36 360
## 2 2 2 2 2 2 2 2 2 2 2
## 369 377 384 385 39 417 422 48 501 542 582
## 2 2 2 2 2 2 2 2 2 2 2
## 583 610 623 632 643 645 657 683 696 7 706
## 2 2 2 2 2 2 2 2 2 2 2
## 732 735 753 806 862 868 871 875 880 897 913
## 2 2 2 2 2 2 2 2 2 2 2
## 914 934 936 941 983 990 V26 172 179 192 240
## 2 2 2 2 2 2 2 3 3 3 3
## 245 250.52 261 266 356 370 374 382 405 483 508
## 3 3 3 3 3 3 3 3 3 3 3
## 524 529 602 603 619 665 709 795 797 846 866
## 3 3 3 3 3 3 3 3 3 3 3
## 879 916 921 928 942 986 987 136 142 175 250.5
## 3 3 3 3 3 3 3 4 4 4 4
## 272 301 34 341 379 388 397 452 470 474 495
## 4 4 4 4 4 4 4 4 4 4 4
## 570 598 633 647 734 745 747 792 793 796 800
## 4 4 4 4 4 4 4 4 4 4 4
## 831 836 863 886 94 141 201 215 228 250.9 273
## 4 4 4 4 4 5 5 5 5 5 5
## 342 344 361 395 41 445 463 49 551 622 655
## 5 5 5 5 5 5 5 5 5 5 5
## 685 692 705 717 751 759 814 815 835 854 881
## 5 5 5 5 5 5 5 5 5 5 5
## 890 893 968 991 V45 163 180 214 223 237 306
## 5 5 5 5 5 6 6 6 6 6 6
## 320 335 353 354 457 526 541 579 641 66 663
## 6 6 6 6 6 6 6 6 6 6 6
## 686 694 725 82 843 883 892 952 964 V63 117
## 6 6 6 6 6 6 6 6 6 6 7
## 210 212 250.21 337 359 362 383 464 588 708 746
## 7 7 7 7 7 7 7 7 7 7 7
## 78 816 845 847 865 88 11 158 250.31 350 454
## 7 7 7 7 7 7 8 8 8 8 8
## 5 521 695 737 864 891 970 992 V71 152 171
## 8 8 8 8 8 8 8 8 8 9 9
## 230 263 312 35 394 565 646 810 882 935 966
## 9 9 9 9 9 9 9 9 9 9 9
## 967 ? 161 184 3 323 338 376 513 652 736
## 9 10 10 10 10 10 10 10 10 10 10
## 861 867 933 977 989 199 200 244 297 368 462
## 10 10 10 10 10 11 11 11 11 11 11
## 54 840 V56 156 250.43 250.93 281 310 358 494 533
## 11 11 11 12 12 12 12 12 12 12 12
## 581 591 594 660 680 693 853 945 250.33 251 355
## 12 12 12 12 12 12 12 12 13 13 13
## 475 480 586 601 718 250.3 250.32 289 357 378 756
## 13 13 13 13 13 14 14 14 14 14 14
## 873 972 204 255 283 627 958 252 461 514 783
## 14 14 15 15 15 15 15 16 16 16 16
## 242 380 473 617 710 999 135 226 282 446 527
## 17 17 17 17 17 17 18 18 18 18 18
## 528 614 656 664 205 250.23 307 429 430 456 568
## 18 18 18 18 19 19 19 19 19 19 19
## 607 616 825 196 305 327 333 421 534 621 794
## 19 19 19 20 20 20 20 20 20 20 20
## 844 920 277 802 522 150 193 203 235 239 396
## 20 20 21 21 22 23 23 23 23 23 23
## 47 510 516 644 658 183 309 442 478 567 611
## 23 23 23 23 23 24 24 24 24 24 24
## 661 851 340 349 492 304 351 788 801 233 596
## 24 24 25 25 25 26 26 26 26 27 27
## 604 714 860 70 42 512 519 573 659 9 922
## 27 27 27 28 29 29 29 29 29 31 31
## 253 275 293 451 485 959 155 286 556 V53 191
## 32 32 32 32 32 32 33 33 33 33 34
## 465 284 332 V54 620 719 965 250.92 220 238 555
## 34 35 35 35 36 36 37 38 39 39 39
## 924 225 227 550 727 423 447 585 822 151 311
## 39 41 41 41 41 42 42 42 42 43 43
## 416 496 53 V55 716 962 112 298 303 969 288
## 43 43 43 43 44 44 45 45 45 45 47
## 738 420 799 250.01 241 432 608 995 250.42 642 850
## 47 48 50 51 52 52 52 52 53 53 53
## 711 300 443 481 438 625 287 290 459 807 425
## 54 55 55 56 58 58 59 60 61 62 63
## 723 782 490 537 346 398 595 250.41 593 250.83 785
## 63 63 64 64 65 65 65 66 66 68 68
## 813 455 79 345 437 729 274 291 576 626 292
## 68 69 69 70 70 70 71 71 71 71 73
## 202 566 726 154 157 294 852 781 790 188 681
## 74 74 75 76 76 76 77 78 79 80 80
## 487 654 515 536 348 250.2 211 728 189 808 413
## 81 81 85 85 87 90 92 94 95 95 96
## 441 564 182 823 444 431 784 553 572 V58 218
## 98 99 100 103 107 108 108 109 109 115 116
## 386 250.81 250.22 174 821 331 557 436 805 730 424
## 116 117 118 123 124 133 134 137 138 140 141
## 198 618 721 404 511 185 575 707 787 532 600
## 148 149 149 152 153 155 155 156 162 165 165
## 571 540 250.03 569 197 466 552 812 250.4 482 411
## 166 170 173 177 179 182 183 188 190 193 194
## 250 280 285 531 250.1 558 426 648 590 997 153
## 200 207 211 212 218 230 241 242 249 249 250
## 401 733 824 458 250.82 162 295 8 592 250.12 535
## 250 250 254 260 264 265 275 275 284 293 296
## 415 402 403 507 724 278 453 530 789 578 250.11
## 304 306 310 314 339 357 360 364 365 410 427
## 998 250.13 250.7 250.02 560 518 440 433 250.6 296 V57
## 451 516 522 523 561 562 607 608 631 634 654
## 722 493 577 562 435 574 820 584 599 38 250.8
## 660 676 682 709 752 773 815 918 975 998 1074
## 996 276 491 780 682 434 715 427 486 410 786
## 1106 1180 1313 1409 1463 1514 1907 2019 2362 2774 3040
## 428 414
## 3876 5209
# We will create a new attribute with larger categories for the diagnoses so they are less cumbersome. The categories will be based on ICD-9-CM categories, with the exception of diabetes mellitus as its own category so that it may be analyzed. The unknown category ('?') will be kept for this step. Unfortunately, we have to use Regex to condense the categories as the attribute is currently a factor with some categories containing letters
db.expl$diag_1 <- rep('?', times = nrow(db.init))
db.expl$diag_1[grepl("^0|^[^EeVv?]$|^[^EeVv][0-9]$|^1[0123][0-9]$",db.init$diag_1)] <- "001-139"
db.expl$diag_1[grepl("^1[4-9][0-9]$|^2[0-3][0-9]$",db.init$diag_1)] <- "140-239"
db.expl$diag_1[grepl("^250.*",db.init$diag_1)] <- "250.0-250.9"
db.expl$diag_1[grepl("^2[467][0-9]$|^25[1-9]$",db.init$diag_1)] <- "240-279(not250)"
db.expl$diag_1[grepl("^28[0-9]$",db.init$diag_1)] <- "280-289"
db.expl$diag_1[grepl("^29[0-9]$|^3[01][0-9]$",db.init$diag_1)] <- "290-319"
db.expl$diag_1[grepl("^3[2-8][0-9]$",db.init$diag_1)] <- "320-389"
db.expl$diag_1[grepl("^39[0-9]$|^4[0-5][0-9]$",db.init$diag_1)] <- "390-459"
db.expl$diag_1[grepl("^4[6-9][0-9]$|^5[01][0-9]$",db.init$diag_1)] <- "460-519"
db.expl$diag_1[grepl("^5[2-7][0-9]$",db.init$diag_1)] <- "520-579"
db.expl$diag_1[grepl("^5[89][0-9]$|^6[0-2][0-9]$",db.init$diag_1)] <- "580-629"
db.expl$diag_1[grepl("^6[3-7][0-9]$",db.init$diag_1)] <- "630-679"
db.expl$diag_1[grepl("^6[89][0-9]$|^70[0-9]$",db.init$diag_1)] <- "680-709"
db.expl$diag_1[grepl("^7[1-3][0-9]$",db.init$diag_1)] <- "710-739"
db.expl$diag_1[grepl("^7[45][0-9]$",db.init$diag_1)] <- "740-759"
db.expl$diag_1[grepl("^7[67][0-9]$",db.init$diag_1)] <- "760-779"
db.expl$diag_1[grepl("^7[89][0-9]$",db.init$diag_1)] <- "780-799"
db.expl$diag_1[grepl("^[89][0-9]{2}$",db.init$diag_1)] <- "800-999"
db.expl$diag_1[grepl("^[Vv]",db.init$diag_1)] <- "V01-V91"
db.expl$diag_1[grepl("^[Ee]",db.init$diag_1)] <- "E000-E999"
db.expl$diag_1 <- as.factor(db.expl$diag_1) # Convert from character to factor
levels(db.expl$diag_1) # Now has 20 levels
## [1] "?" "001-139" "140-239" "240-279(not250)"
## [5] "250.0-250.9" "280-289" "290-319" "320-389"
## [9] "390-459" "460-519" "520-579" "580-629"
## [13] "630-679" "680-709" "710-739" "740-759"
## [17] "780-799" "800-999" "E000-E999" "V01-V91"
sort(prop.table(table(db.expl$diag_1))) # Largest group of people had a primary diagnosis of a circulatory disorder (30.5%); 0.001% of cases are missing; will impute to majority class (390-459)
##
## E000-E999 ? 740-759 630-679 280-289
## 1.429123e-05 1.429123e-04 5.859403e-04 8.374659e-03 9.303589e-03
## 320-389 V01-V91 290-319 001-139 680-709
## 1.226187e-02 1.311935e-02 2.207995e-02 2.408072e-02 2.543838e-02
## 240-279(not250) 140-239 580-629 710-739 800-999
## 2.643877e-02 3.627113e-02 4.879025e-02 5.807954e-02 6.708302e-02
## 780-799 250.0-250.9 520-579 460-519 390-459
## 7.864462e-02 8.214597e-02 9.039201e-02 9.212125e-02 3.046318e-01
ggplot(db.expl, aes(fct_infreq(diag_1))) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Primary Diagnosis', y = 'Count') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Visualize groups
# Want to reduce the levels further; will group anything under 5% into an "Other" category
diag1levels <- levels(db.expl$diag_1)[c(2:4,6:8,12:14,16,19,20)]
db.expl$diag_1 <- droplevels(fct_collapse(db.expl$diag_1, '390-459' = c('390-459', '?'), 'Other' = diag1levels)) # Drop empty levels, and collapse categories - kept the categories over 5%; put rest in "Other"
levels(db.expl$diag_1) #Reduced to 8 levels to be more manageable for modeling
## [1] "390-459" "Other" "250.0-250.9" "460-519" "520-579"
## [6] "710-739" "780-799" "800-999"
table(db.expl$diag_1)
##
## 390-459 Other 250.0-250.9 460-519 520-579 710-739
## 21326 15867 5748 6446 6325 4064
## 780-799 800-999
## 5503 4694
sort(prop.table(table(db.expl$diag_1))) # Other category contains 22.7% of cases
##
## 710-739 800-999 780-799 250.0-250.9 520-579 460-519
## 0.05807954 0.06708302 0.07864462 0.08214597 0.09039201 0.09212125
## Other 390-459
## 0.22675889 0.30477470
Diagnosis 2
length(levels(db.init$diag_2)) # 749 distinct secondary diagnoses
## [1] 749
sort(table(db.init$diag_2)) # Code 250 has the highest number of cases (4996); however, there are likely more, as 250 is split into around 10 categories. The second highest is 276 (4494 cases), which is "Disorders of fluid electrolyte and acid-base balance". Third highest is code 428 (4218 cases), which is Heart Failure.
##
## 115 130 195 250.21 250.3 250.31 325 341 350 364 52
## 0 0 0 0 0 0 0 0 0 0 0
## 695 754 806 846 884 894 912 96 975 990 E813
## 0 0 0 0 0 0 0 0 0 0 0
## E941 E945 V69 111 114 123 137 140 145 163 164
## 0 0 0 1 1 1 1 1 1 1 1
## 171 173 180 182 186 192 212 223 232 235 256
## 1 1 1 1 1 1 1 1 1 1 1
## 268 27 270 271 302 31 316 347 35 353 366
## 1 1 1 1 1 1 1 1 1 1 1
## 374 388 46 460 472 475 483 5 506 520 523
## 1 1 1 1 1 1 1 1 1 1 1
## 529 534 580 602 605 610 615 649 658 66 665
## 1 1 1 1 1 1 1 1 1 1 1
## 670 674 683 684 685 694 7 702 703 704 734
## 1 1 1 1 1 1 1 1 1 1 1
## 748 75 752 800 811 832 833 843 853 863 871
## 1 1 1 1 1 1 1 1 1 1 1
## 879 88 880 893 908 911 915 916 917 927 942
## 1 1 1 1 1 1 1 1 1 1 1
## 944 947 948 953 955 962 963 968 974 977 987
## 1 1 1 1 1 1 1 1 1 1 1
## 99 992 994 E817 E818 E821 E826 E829 E850 E854 E868
## 1 1 1 1 1 1 1 1 1 1 1
## E882 E883 E890 E900 E915 E918 E919 E929 E938 E965 E968
## 1 1 1 1 1 1 1 1 1 1 1
## E980 V03 V11 V13 V25 V50 V55 V60 V61 V66 11
## 1 1 1 1 1 1 1 1 1 1 2
## 141 179 215 217 226 228 240 246 259 269 318
## 2 2 2 2 2 2 2 2 2 2 2
## 352 360 376 377 379 381 383 405 464 474 477
## 2 2 2 2 2 2 2 2 2 2 2
## 484 501 543 622 634 641 645 654 661 663 686
## 2 2 2 2 2 2 2 2 2 2 2
## 742 751 755 78 795 797 801 814 822 831 837
## 2 2 2 2 2 2 2 2 2 2 2
## 862 866 869 872 906 918 933 945 952 967 972
## 2 2 2 2 2 2 2 2 2 2 2
## 980 991 E814 E819 E853 E887 E916 E924 E931 V16 V53
## 2 2 2 2 2 2 2 2 2 2 2
## V86 131 152 225 239 250.23 250.32 297 306 308 320
## 2 3 3 3 3 3 3 3 3 3 3
## 351 355 373 395 422 448 463 470 495 500 508
## 3 3 3 3 3 3 3 3 3 3 3
## 513 522 527 542 588 603 619 656 659 664 691
## 3 3 3 3 3 3 3 3 3 3 3
## 692 725 747 815 826 842 870 913 919 921 922
## 3 3 3 3 3 3 3 3 3 3 3
## 965 E816 E858 E870 E905 E906 E930 E937 V09 V14 V23
## 3 3 3 3 3 3 3 3 3 3 3
## 193 250.33 258 310 323 324 369 454 457 480 579
## 4 4 4 4 4 4 4 4 4 4 4
## 623 647 652 705 712 713 758 810 844 851 892
## 4 4 4 4 4 4 4 4 4 4 4
## 907 909 989 E881 E927 E936 V02 172 227 245 317
## 4 4 4 4 4 4 4 5 5 5 5
## 335 336 343 372 430 452 521 594 598 644 698
## 5 5 5 5 5 5 5 5 5 5 5
## 741 759 793 865 868 934 969 E917 E933 110 136
## 5 5 5 5 5 5 5 5 5 6 6
## 191 208 214 250.91 273 322 380 382 389 494 510
## 6 6 6 6 6 6 6 6 6 6 6
## 514 528 565 604 709 750 791 847 864 867 891
## 6 6 6 6 6 6 6 6 6 6 6
## 9 910 V08 V44 183 188 199 220 241 250.9 260
## 6 6 6 6 7 7 7 7 7 7 7
## 262 299 40 421 524 54 540 586 646 701 792
## 7 7 7 7 7 7 7 7 7 7 7
## 796 836 861 882 E812 V70 117 156 251 266 34
## 7 7 7 7 7 7 8 8 8 8 8
## 365 485 517 533 570 608 621 845 883 905 923
## 8 8 8 8 8 8 8 8 8 8 8
## E934 E950 V18 250.2 261 279 333 338 354 461 627
## 8 8 8 9 9 9 9 9 9 9 9
## 706 737 825 V17 V46 V57 200 281 307 356 423
## 9 9 9 9 9 9 10 10 10 10 10
## 693 696 717 816 881 959 E939 E947 233 314 319
## 10 10 10 10 10 10 10 10 11 11 11
## 431 442 550 745 756 823 E928 E944 250.1 359 378
## 11 11 11 11 11 11 11 11 12 12 12
## 487 555 718 958 201 840 852 E879 V72 151 250.93
## 12 12 12 12 13 13 13 13 13 14 14
## 283 556 607 614 753 824 V49 185 312 346 394
## 14 14 14 14 14 14 14 15 15 15 15
## 432 436 462 583 783 E880 238 566 736 746 860
## 15 15 15 15 15 15 16 16 16 16 16
## 94 138 250.53 298 516 802 821 155 174 252 274
## 16 17 17 17 17 17 17 18 18 18 18
## 349 420 611 617 680 716 E884 V64 358 386 447
## 18 18 18 18 18 18 18 18 19 19 19
## 519 601 616 924 368 532 537 738 813 920 E935
## 19 19 19 19 20 20 20 20 20 20 20
## V65 253 150 250.43 446 557 362 455 478 626 726
## 20 21 22 22 22 22 23 23 23 23 23
## 327 345 642 727 429 850 999 E942 332 53 79
## 24 24 24 24 25 25 25 25 26 26 26
## V63 250.83 473 189 250.22 289 451 620 812 154 275
## 26 27 27 28 28 28 28 28 28 29 29
## 398 820 290 441 481 711 714 808 282 337 531
## 29 29 30 30 30 31 31 31 32 32 32
## 600 E932 618 205 681 V54 873 459 573 512 552
## 32 32 33 34 34 34 35 36 37 38 38
## 277 309 211 465 572 301 490 625 807 V12 456
## 39 39 40 40 40 41 41 41 41 41 42
## 596 157 218 250.52 V62 344 721 255 203 723 E849
## 42 43 43 43 43 44 45 47 48 48 48
## 794 443 595 291 581 250.51 250.11 250.5 250.81 437 568
## 49 51 51 52 52 53 54 54 54 54 54
## 397 42 731 357 564 782 293 311 575 288 576
## 55 55 55 56 56 56 57 57 57 58 59
## 592 242 250.42 553 567 805 482 250.12 250.13 416 435
## 59 60 60 60 60 60 62 64 65 65 65
## 292 719 153 415 444 340 558 135 250.7 204 729
## 68 68 71 71 72 75 75 76 76 77 77
## 995 286 348 466 V15 E878 331 412 E888 V10 593
## 77 79 79 79 79 80 81 84 84 85 87
## V43 728 536 E885 284 304 507 715 781 562 590
## 88 90 93 93 94 95 95 95 101 103 103
## 202 250.4 722 515 263 296 458 724 V58 250.82 300
## 104 106 106 108 109 109 109 109 109 110 110
## 434 8 250.41 433 244 294 112 438 733 404 V85
## 112 113 117 117 118 118 121 123 124 125 127
## 569 710 396 250.8 535 790 730 784 250.92 787 492
## 128 130 132 138 141 141 142 143 148 150 154
## 250.03 V42 196 342 591 799 303 70 198 789 530
## 164 173 174 183 184 190 192 192 206 209 212
## 287 278 453 162 578 571 197 38 426 577 402
## 220 222 224 226 226 232 236 243 246 250 251
## 440 V45 560 648 41 788 295 ? 574 785 996
## 260 265 273 283 286 286 291 293 297 297 304
## 997 511 272 410 998 280 786 305 250.6 493 518
## 304 314 348 363 429 432 480 504 584 627 765
## 413 424 486 425 491 585 682 584 780 285 250.01
## 810 813 830 868 881 908 927 1004 1022 1108 1120
## 707 250.02 403 414 411 496 599 401 427 428 276
## 1183 1542 1576 1977 2034 2188 2210 3077 3445 4218 4494
## 250
## 4996
# Same as for diagnosis 1, we will create a new attribute with larger categories for the diagnoses.
db.expl$diag_2 <- rep('?', times = nrow(db.init))
db.expl$diag_2[grepl("^0|^[^EeVv?]$|^[^EeVv][0-9]$|^1[0123][0-9]$",db.init$diag_2)] <- "001-139"
db.expl$diag_2[grepl("^1[4-9][0-9]$|^2[0-3][0-9]$",db.init$diag_2)] <- "140-239"
db.expl$diag_2[grepl("^250.*",db.init$diag_2)] <- "250.0-250.9"
db.expl$diag_2[grepl("^2[467][0-9]$|^25[1-9]$",db.init$diag_2)] <- "240-279(not250)"
db.expl$diag_2[grepl("^28[0-9]$",db.init$diag_2)] <- "280-289"
db.expl$diag_2[grepl("^29[0-9]$|^3[01][0-9]$",db.init$diag_2)] <- "290-319"
db.expl$diag_2[grepl("^3[2-8][0-9]$",db.init$diag_2)] <- "320-389"
db.expl$diag_2[grepl("^39[0-9]$|^4[0-5][0-9]$",db.init$diag_2)] <- "390-459"
db.expl$diag_2[grepl("^4[6-9][0-9]$|^5[01][0-9]$",db.init$diag_2)] <- "460-519"
db.expl$diag_2[grepl("^5[2-7][0-9]$",db.init$diag_2)] <- "520-579"
db.expl$diag_2[grepl("^5[89][0-9]$|^6[0-2][0-9]$",db.init$diag_2)] <- "580-629"
db.expl$diag_2[grepl("^6[3-7][0-9]$",db.init$diag_2)] <- "630-679"
db.expl$diag_2[grepl("^6[89][0-9]$|^70[0-9]$",db.init$diag_2)] <- "680-709"
db.expl$diag_2[grepl("^7[1-3][0-9]$",db.init$diag_2)] <- "710-739"
db.expl$diag_2[grepl("^7[45][0-9]$",db.init$diag_2)] <- "740-759"
db.expl$diag_2[grepl("^7[67][0-9]$",db.init$diag_2)] <- "760-779"
db.expl$diag_2[grepl("^7[89][0-9]$",db.init$diag_2)] <- "780-799"
db.expl$diag_2[grepl("^[89][0-9]{2}$",db.init$diag_2)] <- "800-999"
db.expl$diag_2[grepl("^[Vv]",db.init$diag_2)] <- "V01-V91"
db.expl$diag_2[grepl("^[Ee]",db.init$diag_2)] <- "E000-E999"
db.expl$diag_2 <- as.factor(db.expl$diag_2) # Convert from character to factor
levels(db.expl$diag_2) # Now has 20 levels
## [1] "?" "001-139" "140-239" "240-279(not250)"
## [5] "250.0-250.9" "280-289" "290-319" "320-389"
## [9] "390-459" "460-519" "520-579" "580-629"
## [13] "630-679" "680-709" "710-739" "740-759"
## [17] "780-799" "800-999" "E000-E999" "V01-V91"
sort(prop.table(table(db.expl$diag_2))) # Largest group of people had a secondary diagnosis of a circulatory disorder (31.1%); 0.4% missing, will impute to majority class (390-459)
##
## 740-759 ? 630-679 E000-E999 320-389
## 0.001186172 0.004187329 0.005044803 0.008145999 0.012776357
## V01-V91 001-139 710-739 140-239 800-999
## 0.017392423 0.017721121 0.018507138 0.022851671 0.026067197
## 290-319 280-289 680-709 520-579 780-799
## 0.026524517 0.029654295 0.031840853 0.038643477 0.045274606
## 580-629 240-279(not250) 460-519 250.0-250.9 390-459
## 0.072056365 0.080102325 0.092106956 0.138624898 0.311291498
ggplot(db.expl, aes(fct_infreq(diag_2))) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Secondary Diagnosis', y = 'Count') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Visualize groups
# Want to reduce the levels further; will group anything under 5% into an "Other" category
diag2levels <- levels(db.expl$diag_2)[c(2,3,6:8,11,13:20)]
db.expl$diag_2 <- droplevels(fct_collapse(db.expl$diag_2, '390-459' = c('390-459', '?'), Other = diag2levels)) # Drop empty levels, and collapse categories - kept the categories over 5%; put rest in "Other"
levels(db.expl$diag_2) #Reduced to 6 levels
## [1] "390-459" "Other" "240-279(not250)" "250.0-250.9"
## [5] "460-519" "580-629"
table(db.expl$diag_2)
##
## 390-459 Other 240-279(not250) 250.0-250.9 460-519
## 22075 21106 5605 9700 6445
## 580-629
## 5042
sort(prop.table(table(db.expl$diag_2))) # Other category contains 30.2% of cases
##
## 580-629 240-279(not250) 460-519 250.0-250.9 Other
## 0.07205636 0.08010233 0.09210696 0.13862490 0.30163063
## 390-459
## 0.31547883
Diagnosis 3
length(levels(db.init$diag_3)) # 790 distinct tertiary diagnoses
## [1] 790
sort(table(db.init$diag_3)) # Code 250 has the highest number of cases (8980); however, there are likely more, as 250 is split into around 10 categories. The second highest is 401 (6549 cases), which is "Essential hypertension". Third highest is code 276 (3302 cases), which is "Disorders of fluid electrolyte and acid-base balance"
##
## 11 111 132 146 175 186 215 230 236 3 448
## 0 0 0 0 0 0 0 0 0 0 0
## 475 483 485 538 671 732 848 863 870 88 884
## 0 0 0 0 0 0 0 0 0 0 0
## 935 942 944 980 992 E864 E876 E915 E955 E987 V22
## 0 0 0 0 0 0 0 0 0 0 0
## 115 122 123 139 14 141 148 152 158 161 164
## 1 1 1 1 1 1 1 1 1 1 1
## 17 171 193 195 217 223 226 243 250.21 250.3 250.31
## 1 1 1 1 1 1 1 1 1 1 1
## 258 265 27 271 308 314 315 341 361 365.44 370
## 1 1 1 1 1 1 1 1 1 1 1
## 381 385 387 391 395 430 460 463 47 480 484
## 1 1 1 1 1 1 1 1 1 1 1
## 49 524 540 542 57 602 603 622 643 657 66
## 1 1 1 1 1 1 1 1 1 1 1
## 669 674 684 690 697 744 75 750 754 755 757
## 1 1 1 1 1 1 1 1 1 1 1
## 759 800 801 811 822 834 837 838 841 844 853
## 1 1 1 1 1 1 1 1 1 1 1
## 854 864 871 872 875 876 877 879 880 890 893
## 1 1 1 1 1 1 1 1 1 1 1
## 928 930 943 951 952 953 955 967 970 971 972
## 1 1 1 1 1 1 1 1 1 1 1
## 987 989 E815 E822 E825 E826 E828 E852 E853 E854 E855
## 1 1 1 1 1 1 1 1 1 1 1
## E861 E865 E882 E883 E886 E892 E894 E900 E901 E904 E912
## 1 1 1 1 1 1 1 1 1 1 1
## E922 E943 E945 E949 E965 E966 V01 V06 V07 136 173
## 1 1 1 1 1 1 1 1 1 2 2
## 180 192 216 220 250.23 262 268 313 323 350 353
## 2 2 2 2 2 2 2 2 2 2 2
## 360 374 376 377 384 421 445 500 508 523 529
## 2 2 2 2 2 2 2 2 2 2 2
## 534 550 597 624 641 670 7 702 706 734 735
## 2 2 2 2 2 2 2 2 2 2 2
## 795 797 814 816 851 852 862 866 868 911 912
## 2 2 2 2 2 2 2 2 2 2 2
## 918 919 948 956 966 E813 E817 E919 E920 E924 E946
## 2 2 2 2 2 2 2 2 2 2 2
## E980 V02 V11 V25 V53 V70 156 163 172 179 182
## 2 2 2 2 2 2 3 3 3 3 3
## 214 245 259 299 334 335 336 347 373 379 417
## 3 3 3 3 3 3 3 3 3 3 3
## 452 470 495 5 506 525 565 619 655 661 685
## 3 3 3 3 3 3 3 3 3 3 3
## 703 720 78 826 842 865 910 913 915 921 933
## 3 3 3 3 3 3 3 3 3 3 3
## E858 E887 E905 E916 E917 E937 E956 V03 V60 V86 155
## 3 3 3 3 3 3 3 3 3 3 4
## 170 191 233 235 239 240 250.91 260 270 35 359
## 4 4 4 4 4 4 4 4 4 4 4
## 472 481 510 528 543 566 579 604 653 665 742
## 4 4 4 4 4 4 4 4 4 4 4
## 758 831 847 850 860 891 9 906 908 909 934
## 4 4 4 4 4 4 4 4 4 4 4
## E818 E850 E906 E936 E941 V23 V61 200 246 256 306
## 4 4 4 4 4 4 4 5 5 5 5
## 34 366 372 383 388 420 436 454 464 516 527
## 5 5 5 5 5 5 5 5 5 5 5
## 623 644 658 704 712 741 751 752 815 945 962
## 5 5 5 5 5 5 5 5 5 5 5
## 965 991 E927 E938 V13 V66 117 131 151 225 227
## 5 5 5 5 5 5 6 6 6 6 6
## 228 250.2 250.22 297 380 431 432 557 570 605 660
## 6 6 6 6 6 6 6 6 6 6 6
## 701 705 756 810 821 845 867 881 882 917 E870
## 6 6 6 6 6 6 6 6 6 6 6
## E929 V55 V57 150 183 201 251 279 354 451 582
## 6 6 6 7 7 7 7 7 7 7 7
## 594 610 652 686 708 713 717 746 796 825 836
## 7 7 7 7 7 7 7 7 7 7 7
## 892 V16 208 273 283 310 312 405 494 501 514
## 7 7 8 8 8 8 8 8 8 8 8
## 649 680 694 695 718 747 793 823 861 883 958
## 8 8 8 8 8 8 8 8 8 8 8
## 969 E819 E881 E928 V72 241 250.13 250.9 250.93 307 318
## 8 8 8 8 8 9 9 9 9 9 9
## 389 580 598 607 698 753 907 E816 261 351 382
## 9 9 9 9 9 9 9 9 10 10 10
## 522 586 608 647 656 791 820 V63 154 266 317
## 10 10 10 10 10 10 10 10 11 11 11
## 378 394 487 521 54 611 621 692 709 725 808
## 11 11 11 11 11 11 11 11 11 11 11
## 840 916 E931 V14 188 250.11 343 556 654 693 792
## 11 11 11 11 12 12 12 12 12 12 12
## 959 138 205 349 355 42 53 552 646 663 922
## 12 13 13 13 13 13 13 13 13 13 13
## 923 E933 E934 V08 110 358 442 462 533 537 664
## 13 13 13 13 14 14 14 14 14 14 14
## 711 905 E930 E939 E950 365 398 477 627 745 E944
## 14 14 14 14 14 15 15 15 15 15 15
## V18 157 250.1 250.83 356 386 461 588 601 737 802
## 15 16 16 16 16 16 16 16 16 16 16
## E884 369 457 532 738 813 94 189 252 555 567
## 16 17 17 17 17 17 17 18 18 18 18
## E812 281 736 824 920 999 V65 346 415 423 723
## 18 19 19 19 19 19 19 20 20 20 20
## V44 V62 446 696 805 V17 199 250.53 319 512 620
## 20 20 21 21 21 21 22 22 22 22 22
## 812 E880 298 345 614 E935 333 338 456 617 626
## 22 22 23 23 23 23 24 24 24 24 24
## 807 E947 447 517 519 572 616 659 727 238 250.81
## 24 24 25 25 25 25 25 25 25 26 26
## 277 289 250.12 282 478 618 924 V49 291 482 642
## 26 26 27 28 28 28 28 28 29 29 29
## 153 174 368 726 590 250.43 309 V27 253 434 444
## 30 30 30 30 31 32 32 32 33 34 34
## 575 576 79 E942 V46 V64 595 E879 441 573 873
## 34 34 34 34 34 34 35 35 36 36 36
## 203 185 292 783 340 344 490 507 V54 435 721
## 37 38 38 38 39 39 39 39 39 40 40
## 531 348 473 255 581 568 135 429 625 204 218
## 42 43 43 44 44 45 46 46 46 48 48
## 274 327 592 E932 465 293 242 362 600 583 301
## 49 50 50 50 51 54 55 55 55 57 58
## 337 455 681 275 250.52 290 596 250.82 332 437 V09
## 58 58 58 59 60 60 62 63 63 63 64
## 714 719 250.42 250.5 716 202 196 286 V85 728 459
## 65 65 67 68 68 69 70 70 71 72 73
## 288 304 569 38 710 211 404 558 250.51 731 284
## 77 77 77 78 78 79 79 79 80 80 81
## 794 250.7 782 466 E885 577 722 648 250.03 591 162
## 84 85 85 87 90 91 92 94 95 95 97
## 250.41 515 564 781 E888 724 250.92 574 416 553 E878
## 97 97 99 99 99 101 104 106 107 107 107
## 729 397 8 433 331 453 730 342 536 410 263
## 110 112 112 113 119 123 124 125 126 128 129
## 198 296 440 443 492 733 303 790 V43 V12 562
## 130 132 135 135 135 135 138 139 139 140 143
## 112 311 715 V42 578 300 70 784 438 996 396
## 148 151 152 152 159 164 169 169 173 174 179
## 789 995 E849 V10 571 458 799 295 426 535 294
## 181 182 182 182 183 189 192 194 194 198 204
## 250.8 511 560 197 413 998 V15 785 997 250.4 593
## 209 214 216 229 236 237 239 241 241 247 251
## 412 787 788 402 287 357 280 411 486 491 V58
## 252 253 262 274 284 289 290 307 329 346 348
## 786 244 530 493 41 518 682 278 584 250.01 250.6
## 405 411 469 504 514 519 539 559 591 638 660
## 305 425 707 424 285 V45 250.02 780 585 ? 599
## 704 707 766 778 840 888 894 901 920 1224 1240
## 403 496 272 427 414 428 276 401 250
## 1258 1594 1630 2590 2646 2753 3302 6549 8980
# Same as for diagnosis 1 and 2, we will create a new attribute with larger categories for the diagnoses.
db.expl$diag_3 <- rep('?', times = nrow(db.init))
db.expl$diag_3[grepl("^0|^[^EeVv?]$|^[^EeVv][0-9]$|^1[0123][0-9]$",db.init$diag_3)] <- "001-139"
db.expl$diag_3[grepl("^1[4-9][0-9]$|^2[0-3][0-9]$",db.init$diag_3)] <- "140-239"
db.expl$diag_3[grepl("^250.*",db.init$diag_3)] <- "250.0-250.9"
db.expl$diag_3[grepl("^2[467][0-9]$|^25[1-9]$",db.init$diag_3)] <- "240-279(not250)"
db.expl$diag_3[grepl("^28[0-9]$",db.init$diag_3)] <- "280-289"
db.expl$diag_3[grepl("^29[0-9]$|^3[01][0-9]$",db.init$diag_3)] <- "290-319"
db.expl$diag_3[grepl("^3[2-8][0-9]$",db.init$diag_3)] <- "320-389"
db.expl$diag_3[grepl("^39[0-9]$|^4[0-5][0-9]$",db.init$diag_3)] <- "390-459"
db.expl$diag_3[grepl("^4[6-9][0-9]$|^5[01][0-9]$",db.init$diag_3)] <- "460-519"
db.expl$diag_3[grepl("^5[2-7][0-9]$",db.init$diag_3)] <- "520-579"
db.expl$diag_3[grepl("^5[89][0-9]$|^6[0-2][0-9]$",db.init$diag_3)] <- "580-629"
db.expl$diag_3[grepl("^6[3-7][0-9]$",db.init$diag_3)] <- "630-679"
db.expl$diag_3[grepl("^6[89][0-9]$|^70[0-9]$",db.init$diag_3)] <- "680-709"
db.expl$diag_3[grepl("^7[1-3][0-9]$",db.init$diag_3)] <- "710-739"
db.expl$diag_3[grepl("^7[45][0-9]$",db.init$diag_3)] <- "740-759"
db.expl$diag_3[grepl("^7[67][0-9]$",db.init$diag_3)] <- "760-779"
db.expl$diag_3[grepl("^7[89][0-9]$",db.init$diag_3)] <- "780-799"
db.expl$diag_3[grepl("^[89][0-9]{2}$",db.init$diag_3)] <- "800-999"
db.expl$diag_3[grepl("^[Vv]",db.init$diag_3)] <- "V01-V91"
db.expl$diag_3[grepl("^[Ee]",db.init$diag_3)] <- "E000-E999"
db.expl$diag_3 <- as.factor(db.expl$diag_3) # Convert from character to factor
levels(db.expl$diag_3) # Now has 20 levels
## [1] "?" "001-139" "140-239" "240-279(not250)"
## [5] "250.0-250.9" "280-289" "290-319" "320-389"
## [9] "390-459" "460-519" "520-579" "580-629"
## [13] "630-679" "680-709" "710-739" "740-759"
## [17] "780-799" "800-999" "E000-E999" "V01-V91"
sort(prop.table(table(db.expl$diag_3))) # Largest group of people had a tertiary diagnosis of a circulatory disorder (29.5%); 1.8% missing values, will be imputed to majority class (390-459)
##
## 740-759 630-679 E000-E999 140-239 ?
## 0.001057551 0.003901505 0.013290841 0.016363454 0.017506753
## 001-139 320-389 710-739 800-999 680-709
## 0.017563918 0.017635374 0.019550398 0.020136338 0.021365384
## 280-289 290-319 520-579 V01-V91 780-799
## 0.024623783 0.030640390 0.034956340 0.036957112 0.044159890
## 580-629 460-519 240-279(not250) 250.0-250.9 390-459
## 0.054092293 0.060666257 0.091506724 0.179297729 0.294727967
ggplot(db.expl, aes(fct_infreq(diag_3))) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Tertiary Diagnosis', y = 'Count') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Visualize groups
# Want to reduce the levels further; will group anything under 5% into an "Other" category
diag3levels <- levels(db.expl$diag_3)[c(2,3,6:8,11,13:20)]
db.expl$diag_3 <- droplevels(fct_collapse(db.expl$diag_3, '390-459' = c('390-459', '?'), 'Other' = diag3levels)) # Drop empty levels, and collapse categories - kept the categories over 5%; put rest in "Other"
levels(db.expl$diag_3) #Reduced to 6 levels
## [1] "390-459" "Other" "240-279(not250)" "250.0-250.9"
## [5] "460-519" "580-629"
table(db.expl$diag_3)
##
## 390-459 Other 240-279(not250) 250.0-250.9 460-519
## 21848 21146 6403 12546 4245
## 580-629
## 3785
sort(prop.table(table(db.expl$diag_3))) # Other category contains 30.2% of cases
##
## 580-629 460-519 240-279(not250) 250.0-250.9 Other
## 0.05409229 0.06066626 0.09150672 0.17929773 0.30220228
## 390-459
## 0.31223472
Individual Medication Attributes
for(i in 25:47){
x <- colnames(db.init[i])
y <- sort(table(db.init[i]))
z <- sort(prop.table(table(db.init[i])))
print(c(x, y, z))
} # Review the name, frequency, and relative frequency for each individual medication. Each attribute has four levels: Down, Up, Steady, and No
## Down Up
## "metformin" "435" "834"
## Steady No Down
## "13634" "55070" "0.0062166835779515"
## Up Steady No
## "0.0119188829977277" "0.194846583682277" "0.787017849742043"
## Down Up
## "repaglinide" "28" "71"
## Steady No Down
## "818" "69056" "0.000400154345247453"
## Up Steady No
## "0.00101467708973461" "0.011690223371872" "0.986894945193146"
## Down Up
## "nateglinide" "8" "16"
## Steady No Down
## "467" "69482" "0.000114329812927844"
## Up Steady No
## "0.000228659625855687" "0.00667400282966287" "0.992983007731554"
## Down Up
## "chlorpropamide" "1" "4"
## Steady No Down
## "66" "69902" "1.42912266159804e-05"
## Up Steady No
## "5.71649064639218e-05" "0.00094322095665471" "0.998985322910265"
## Down Up
## "glimepiride" "136" "230"
## Steady No Down
## "3331" "66276" "0.00194360681977334"
## Up Steady No
## "0.0032869821216755" "0.0476040758578309" "0.94716533520072"
## Steady No
## "acetohexamide" "1" "69972"
## Steady No
## "1.42912266159804e-05" "0.999985708773384"
## Down Up
## "glipizide" "371" "573"
## Steady No Down
## "8063" "60966" "0.00530204507452875"
## Up Steady No
## "0.0081888728509568" "0.11523016020465" "0.871278921869864"
## Down Up
## "glyburide" "418" "613"
## Steady No Down
## "6744" "62198" "0.00597373272547983"
## Up Steady No
## "0.00876052191559602" "0.0963800322981722" "0.888885713060752"
## Steady No
## "tolbutamide" "17" "69956"
## Steady No
## "0.000242950852471668" "0.999757049147528"
## Down Up
## "pioglitazone" "81" "178"
## Steady No Down
## "5004" "64710" "0.00115758935589442"
## Up Steady No
## "0.00254383833764452" "0.0715132979863662" "0.924785274320095"
## Down Up
## "rosiglitazone" "74" "132"
## Steady No Down
## "4455" "65312" "0.00105755076958255"
## Up Steady No
## "0.00188644191330942" "0.0636674145741929" "0.933388592742915"
## Down Up
## "acarbose" "0" "10"
## Steady No Down
## "190" "69773" "0"
## Up Steady No
## "0.000142912266159804" "0.00271533305703629" "0.997141754676804"
## Down Up
## "miglitol" "1" "1"
## Steady No Down
## "18" "69953" "1.42912266159804e-05"
## Up Steady No
## "1.42912266159804e-05" "0.000257242079087648" "0.99971417546768"
## Steady No
## "troglitazone" "3" "69970"
## Steady No
## "4.28736798479414e-05" "0.999957126320152"
## Up Steady
## "tolazamide" "0" "30"
## No Up Steady
## "69943" "0" "0.000428736798479413"
## No
## "0.999571263201521"
## No No
## "examide" "69973" "1"
## No No
## "citoglipton" "69973" "1"
## Up Down
## "insulin" "6777" "7321"
## Steady No Up
## "21617" "34258" "0.0968516427764995"
## Down Steady No
## "0.104626070055593" "0.308933445757649" "0.489588841410258"
## Down Up
## "glyburide.metformin" "4" "7"
## Steady No Down
## "485" "69477" "5.71649064639218e-05"
## Up Steady No
## "0.000100038586311863" "0.00693124490875052" "0.992911551598474"
## Steady No
## "glipizide.metformin" "7" "69966"
## Steady No
## "0.000100038586311863" "0.999899961413688"
## Steady
## "glimepiride.pioglitazone" "0"
## No Steady
## "69973" "0"
## No
## "1"
## Steady No
## "metformin.rosiglitazone" "2" "69971"
## Steady No
## "2.85824532319609e-05" "0.999971417546768"
## Steady No
## "metformin.pioglitazone" "1" "69972"
## Steady No
## "1.42912266159804e-05" "0.999985708773384"
# Any attribute with one category encompassing 95% of the records or greater will be removed for modeling due to low variance. This includes 16 of the 23 medications: Repaglinide, Nateglinide, Chlorpropamide, Acetohexamide, Tolbutamide, Acarbose, Miglitol, Troglitazone, Tolazamide, Examide, Citoglipton, Glyburide-metformin, Glipizide-metformin, Glimepiride-pioglitazone, Metformin-rosiglitazone, and Metformin-pioglitazone
# Add medication attributes to keep for exploratory analysis to db.expl
db.expl$metformin <- db.init$metformin
db.expl$glimepiride <- db.init$glimepiride
db.expl$glipizide <- db.init$glipizide
db.expl$glyburide <- db.init$glyburide
db.expl$pioglitazone <- db.init$pioglitazone
db.expl$rosiglitazone <- db.init$rosiglitazone
db.expl$insulin <- db.init$insulin
Change in Medication
table(db.init$change)
##
## Ch No
## 31491 38482
sort(prop.table(table(db.init$change))) # The two categories are pretty evenly split. 45.0% had a change in diabetic medication and 55.0% did not
##
## Ch No
## 0.450045 0.549955
db.expl$change <- db.init$change # Add to exploration data frame
Diabtes Medication
table(db.init$diabetesMed)
##
## No Yes
## 16680 53293
sort(prop.table(table(db.init$diabetesMed))) #76.2% of patients are taking a diabetic medication; 23.8% are not
##
## No Yes
## 0.2383777 0.7616223
db.expl$diabetesMed <- db.init$diabetesMed # Add to exploration data frame
table(db.init$readmitted) # There are currently 3 levels: readmitted within 30 days, readmitted after 30 days, and no readmission.
##
## <30 >30 NO
## 6277 22222 41474
# '>30' and 'NO' will be grouped together, as the purpose of analysis is to compare patients readmitted within 30 days to those who were not
db.expl$readmitted <- fct_collapse(db.init$readmitted, 'Not<30' = c('>30', 'NO'))
table(db.expl$readmitted)
##
## <30 Not<30
## 6277 63696
sort(prop.table(table(db.expl$readmitted))) # The target attribute is imbalanced, with only 9.0% of the patients having been readmitted within 30 days, whereas 91.0% were not
##
## <30 Not<30
## 0.08970603 0.91029397
Now that we have condensed categories for certain variables, and removed attributes that were irrelevant, had a large number of missing values, or had low variance, let’s review the final structure of the dataset to be used for exploratory analysis (db.expl).
str(db.expl) # We now have a dataframe with 28 variables and 69973 records ready for further exploratory analysis
## 'data.frame': 69973 obs. of 28 variables:
## $ time_in_hospital : int 13 12 1 9 3 7 7 10 4 1 ...
## $ num_lab_procedures : int 68 33 51 47 31 62 60 55 70 49 ...
## $ num_procedures : int 2 3 0 2 6 0 0 1 1 5 ...
## $ num_medications : int 28 18 8 17 16 11 15 31 21 2 ...
## $ number_outpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_emergency : int 0 0 0 0 0 0 1 0 0 0 ...
## $ number_inpatient : int 0 0 0 0 0 0 0 0 0 0 ...
## $ number_diagnoses : int 8 8 5 9 9 7 8 8 7 8 ...
## $ age : Factor w/ 10 levels "[0-10)","[10-20)",..: 9 10 5 5 6 7 5 9 7 7 ...
## $ A1Cresult : Factor w/ 3 levels ">7","None","Norm": 2 2 2 2 2 2 2 2 2 2 ...
## $ race : Factor w/ 5 levels "Caucasian","AfricanAmerican",..: 1 1 1 2 1 2 1 1 1 2 ...
## $ gender : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 2 2 1 ...
## $ admission_type_id : Factor w/ 4 levels "Emergency","Urgent",..: 2 3 1 1 2 2 1 1 3 3 ...
## $ discharge_disposition_id: Factor w/ 5 levels "DcHome","DcOtherFacility",..: 1 3 1 1 1 1 3 4 1 1 ...
## $ admission_source_id : Factor w/ 4 levels "PhysRef","TransferExtFacility",..: 2 2 3 3 2 2 3 3 2 2 ...
## $ diag_1 : Factor w/ 8 levels "390-459","Other",..: 1 1 2 3 1 2 1 1 1 4 ...
## $ diag_2 : Factor w/ 6 levels "390-459","Other",..: 1 2 2 1 1 2 4 1 1 2 ...
## $ diag_3 : Factor w/ 6 levels "390-459","Other",..: 2 5 4 2 4 2 4 1 2 6 ...
## $ metformin : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 3 2 3 2 ...
## $ glimepiride : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 3 2 ...
## $ glipizide : Factor w/ 4 levels "Down","No","Steady",..: 3 2 3 2 2 2 2 2 2 2 ...
## $ glyburide : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 4 2 2 2 2 ...
## $ pioglitazone : Factor w/ 4 levels "Down","No","Steady",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ rosiglitazone : Factor w/ 4 levels "Down","No","Steady",..: 2 3 2 2 2 2 2 2 2 2 ...
## $ insulin : Factor w/ 4 levels "Down","No","Steady",..: 3 3 3 3 3 3 1 3 3 3 ...
## $ change : Factor w/ 2 levels "Ch","No": 1 1 1 2 2 1 1 2 1 2 ...
## $ diabetesMed : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ readmitted : Factor w/ 2 levels "<30","Not<30": 2 2 2 2 2 1 1 2 2 2 ...
#write.csv(db.expl, '/Users/amyhowe/Desktop/Dataset_Exploration.csv', row.names = F) # Export dataset if desired
Let’s take a look at the correlations of numeric and ordinal attributes. Given that there will be one ordinal attribute (Age) to compare in the mix, as well as the fact that there are numerous outliers in the numeric data, we will be using Spearman’s correlation which uses ranks.
rcorr(as.matrix(cbind(db.expl[,c(1:8)], age = as.numeric(db.expl$age))), type = 'spearman') # No strong monotonic correlations. There are moderate correlations between number of medications and time in hospital (0.46), number of medications and number of procedures (0.37), and number of lab procedures and time in hospital (0.35). The rest are weak correlations. There is 0 correlation between time in hospital and number of emergency visits, but this is NOT statistically significant (p = 0.283). The same applies for number of procedures and number of lab procedures (p = 0.309). The rest of the findings are highly significant (p<0.01)
## time_in_hospital num_lab_procedures num_procedures
## time_in_hospital 1.00 0.35 0.18
## num_lab_procedures 0.35 1.00 0.00
## num_procedures 0.18 0.00 1.00
## num_medications 0.46 0.24 0.37
## number_outpatient -0.02 -0.02 -0.01
## number_emergency 0.00 0.02 -0.04
## number_inpatient 0.07 0.08 -0.01
## number_diagnoses 0.25 0.17 0.07
## age 0.14 0.03 -0.05
## num_medications number_outpatient number_emergency
## time_in_hospital 0.46 -0.02 0.00
## num_lab_procedures 0.24 -0.02 0.02
## num_procedures 0.37 -0.01 -0.04
## num_medications 1.00 0.05 0.02
## number_outpatient 0.05 1.00 0.15
## number_emergency 0.02 0.15 1.00
## number_inpatient 0.06 0.10 0.14
## number_diagnoses 0.29 0.09 0.07
## age 0.04 0.03 -0.03
## number_inpatient number_diagnoses age
## time_in_hospital 0.07 0.25 0.14
## num_lab_procedures 0.08 0.17 0.03
## num_procedures -0.01 0.07 -0.05
## num_medications 0.06 0.29 0.04
## number_outpatient 0.10 0.09 0.03
## number_emergency 0.14 0.07 -0.03
## number_inpatient 1.00 0.08 0.03
## number_diagnoses 0.08 1.00 0.21
## age 0.03 0.21 1.00
##
## n= 69973
##
##
## P
## time_in_hospital num_lab_procedures num_procedures
## time_in_hospital 0.0000 0.0000
## num_lab_procedures 0.0000 0.3090
## num_procedures 0.0000 0.3090
## num_medications 0.0000 0.0000 0.0000
## number_outpatient 0.0000 0.0000 0.0012
## number_emergency 0.2831 0.0000 0.0000
## number_inpatient 0.0000 0.0000 0.0160
## number_diagnoses 0.0000 0.0000 0.0000
## age 0.0000 0.0000 0.0000
## num_medications number_outpatient number_emergency
## time_in_hospital 0.0000 0.0000 0.2831
## num_lab_procedures 0.0000 0.0000 0.0000
## num_procedures 0.0000 0.0012 0.0000
## num_medications 0.0000 0.0000
## number_outpatient 0.0000 0.0000
## number_emergency 0.0000 0.0000
## number_inpatient 0.0000 0.0000 0.0000
## number_diagnoses 0.0000 0.0000 0.0000
## age 0.0000 0.0000 0.0000
## number_inpatient number_diagnoses age
## time_in_hospital 0.0000 0.0000 0.0000
## num_lab_procedures 0.0000 0.0000 0.0000
## num_procedures 0.0160 0.0000 0.0000
## num_medications 0.0000 0.0000 0.0000
## number_outpatient 0.0000 0.0000 0.0000
## number_emergency 0.0000 0.0000 0.0000
## number_inpatient 0.0000 0.0000
## number_diagnoses 0.0000 0.0000
## age 0.0000 0.0000
corr <- round(cor(cbind(db.expl[,c(1:8)], age = as.numeric(db.expl$age)), method = 'spearman'), 2)
ggcorrplot(corr, hc.order = TRUE, type = "lower", lab = TRUE) # Visualize the correlation heatmap
Let’s take a look at the scatterplots for each combination of numeric variables.
pairs(db.expl[,1:8], pch = 20, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.1))
Let’s focus in on the scatterplots for the three combinations we found to have the highest Spearman’s r correlation coefficient.
plot(jitter(db.expl$time_in_hospital,2), db.expl$num_medications, pch=20, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.1), xlab = 'Time in Hospital', ylab = 'No. Medications') # Added transparency, as well as jitter along the x-axis to better see the trend. As the time in hospital increases, the number of medications appears to increase at a linear rate. However, there appears to be a lot of noise as well
plot(jitter(db.expl$num_procedures,2), db.expl$num_medications, pch=20, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.1), xlab = 'No. Procedures', ylab = 'No. Medications') # Jitter added along the x-axis as well. It appears as if as the number of procedures increase, the number of medications increase as well at a linear rate. However, there is more noise as the number of procedures increases as well
plot(jitter(db.expl$time_in_hospital,2), db.expl$num_lab_procedures, pch=20, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.1), xlab = 'Time in Hospital', ylab = 'No. Lab Procedures') # Jitter present for x-axis here too. As the time in hospital increases, the number of lab procedures increase. However, the relationship may be modeled better with a curve than with a line.
We will compare each variable against the target variable (readmitted) to identify differences between the two groups: patients who were readmitted within 30 days, and patients who weren’t.
For numeric variables, the count, mean, standard deviation, and boxplots will be compared between both groups. Additionally, a sample t-test will be completed to test for a statistically significant difference between the two groups.
Time in Hospital VS Readmitted
group_by(db.expl, readmitted) %>%
summarise(count = n(), mean = mean(time_in_hospital), sd = sd(time_in_hospital)) # The mean for time in hospital for the <30 group is slightly higher than the not readmitted group, with a slightly higher standard deviation as well.
## # A tibble: 2 x 4
## readmitted count mean sd
## <fct> <int> <dbl> <dbl>
## 1 <30 6277 4.80 3.06
## 2 Not<30 63696 4.22 2.92
boxplot(time_in_hospital~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Time in Hospital", ylab = "Readmission Status") # Visualize the two groups
# Test whether the mean of the <30 group is significantly higher than the Not<30 group; no need to test for normality as sample sizes are large (central limit theorem). First check to see if the variances are equal using an f-test to inform the t-test arguments
var.test(time_in_hospital ~ readmitted, data = db.expl) # Variances are not equal (p=2.644e-07)
##
## F test to compare two variances
##
## data: time_in_hospital by readmitted
## F = 1.0996, num df = 6276, denom df = 63695, p-value = 2.644e-07
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 1.060322 1.141018
## sample estimates:
## ratio of variances
## 1.099623
t.test(db.expl$time_in_hospital[db.expl$readmitted == "<30"], db.expl$time_in_hospital[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE) # The finding that the mean for the <30 group is higher than the Not<30 group is highly significant (p<2.2e-16). Therefore, there is a longer average length of stay for patients who are readmitted within 30 days
##
## Welch Two Sample t-test
##
## data: db.expl$time_in_hospital[db.expl$readmitted == "<30"] and db.expl$time_in_hospital[db.expl$readmitted == "Not<30"]
## t = 14.286, df = 7445.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.5093295 Inf
## sample estimates:
## mean of x mean of y
## 4.797196 4.221584
Number of Lab Procedures VS Readmitted
group_by(db.expl, readmitted) %>%
summarise(count = n(), mean = mean(num_lab_procedures), sd = sd(num_lab_procedures)) # The mean for number of lab procedures for the <30 group is slightly higher than the not readmitted group, with a slightly lower standard deviation
## # A tibble: 2 x 4
## readmitted count mean sd
## <fct> <int> <dbl> <dbl>
## 1 <30 6277 44.9 19.3
## 2 Not<30 63696 42.7 19.9
boxplot(num_lab_procedures~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Lab Procedures", ylab = "Readmission Status")
var.test(num_lab_procedures ~ readmitted, data = db.expl) # Variances of the two groups are not equal (p=0.001259)
##
## F test to compare two variances
##
## data: num_lab_procedures by readmitted
## F = 0.94087, num df = 6276, denom df = 63695, p-value = 0.001259
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.9072466 0.9762929
## sample estimates:
## ratio of variances
## 0.9408736
t.test(db.expl$num_lab_procedures[db.expl$readmitted == "<30"], db.expl$num_lab_procedures[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE) # The finding that the mean for the <30 group is higher than the Not<30 group is highly significant (p<2.2e-16). Therefore, there is a higher average number of lab procedures completed for patients who are readmitted within 30 days
##
## Welch Two Sample t-test
##
## data: db.expl$num_lab_procedures[db.expl$readmitted == "<30"] and db.expl$num_lab_procedures[db.expl$readmitted == "Not<30"]
## t = 8.7323, df = 7651.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 1.818288 Inf
## sample estimates:
## mean of x mean of y
## 44.91541 42.67507
Number of Procedures VS Readmitted
group_by(db.expl, readmitted) %>%
summarise(count = n(), mean = mean(num_procedures), sd = sd(num_procedures)) # The means and standard deviations are very close for both groups
## # A tibble: 2 x 4
## readmitted count mean sd
## <fct> <int> <dbl> <dbl>
## 1 <30 6277 1.42 1.73
## 2 Not<30 63696 1.43 1.76
boxplot(num_procedures~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Procedures", ylab = "Readmission Status") # Distributions look identical
var.test(num_procedures ~ readmitted, data = db.expl) # Fail to reject the null hypothesis that the variances are equal (p=0.07307)
##
## F test to compare two variances
##
## data: num_procedures by readmitted
## F = 0.96678, num df = 6276, denom df = 63695, p-value = 0.07307
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.9322232 1.0031703
## sample estimates:
## ratio of variances
## 0.9667759
t.test(db.expl$num_procedures[db.expl$readmitted == "<30"], db.expl$num_procedures[db.expl$readmitted == "Not<30"], alternative = 'less' , var.equal = TRUE) # Fail to reject the null hypothesis that the means are the same (p=0.4847)
##
## Two Sample t-test
##
## data: db.expl$num_procedures[db.expl$readmitted == "<30"] and db.expl$num_procedures[db.expl$readmitted == "Not<30"]
## t = -0.038297, df = 69971, p-value = 0.4847
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf 0.03734585
## sample estimates:
## mean of x mean of y
## 1.424725 1.425615
Number of Medications VS Readmitted
group_by(db.expl, readmitted) %>%
summarise(count = n(), mean = mean(num_medications), sd = sd(num_medications)) # The mean and standard deviation of the <30 group is slightly higher
## # A tibble: 2 x 4
## readmitted count mean sd
## <fct> <int> <dbl> <dbl>
## 1 <30 6277 16.6 8.32
## 2 Not<30 63696 15.6 8.28
boxplot(num_medications~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Medications", ylab = "Readmission Status") # Distributions look similar
var.test(num_medications ~ readmitted, data = db.expl) # Fail to reject the null hypothesis that the variances are equal (p=0.5465)
##
## F test to compare two variances
##
## data: num_medications by readmitted
## F = 1.0112, num df = 6276, denom df = 63695, p-value = 0.5465
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.9750895 1.0492990
## sample estimates:
## ratio of variances
## 1.011231
t.test(db.expl$num_medications[db.expl$readmitted == "<30"], db.expl$num_medications[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = TRUE) # The results are highly significant (p<2.2e-16) that the mean of the <30 group is greater than the Not<30 group. Therefore, the average number of medications for those readmitted within 30 days is higher than those not readmitted within 30 days
##
## Two Sample t-test
##
## data: db.expl$num_medications[db.expl$readmitted == "<30"] and db.expl$num_medications[db.expl$readmitted == "Not<30"]
## t = 9.6309, df = 69971, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.8749603 Inf
## sample estimates:
## mean of x mean of y
## 16.62578 15.57060
Number of Outpatient Visits VS Readmitted
group_by(db.expl, readmitted) %>%
summarise(count = n(), mean = mean(number_outpatient), sd = sd(number_outpatient)) # The mean of the <30 group is slightly higher, while the sd is slightly lower
## # A tibble: 2 x 4
## readmitted count mean sd
## <fct> <int> <dbl> <dbl>
## 1 <30 6277 0.308 1.04
## 2 Not<30 63696 0.277 1.07
boxplot(number_outpatient~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Outpatient Visits", ylab = "Readmission Status") # Distributions look similar, with fewer outliers for <30
var.test(number_outpatient ~ readmitted, data = db.expl) # Can reject the null hypothesis at the alpha=0.05 level that the variances are equal (p=0.03272)
##
## F test to compare two variances
##
## data: number_outpatient by readmitted
## F = 0.96053, num df = 6276, denom df = 63695, p-value = 0.03272
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.9261983 0.9966869
## sample estimates:
## ratio of variances
## 0.9605277
t.test(db.expl$number_outpatient[db.expl$readmitted == "<30"], db.expl$number_outpatient[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE) # The results are statistically significant (p=0.01096) that the mean of the <30 group is greater than the Not<30 group. Therefore, the average number of outpatient visits in the last year for those readmitted within 30 days is higher than those not readmitted within 30 days.
##
## Welch Two Sample t-test
##
## data: db.expl$number_outpatient[db.expl$readmitted == "<30"] and db.expl$number_outpatient[db.expl$readmitted == "Not<30"]
## t = 2.2924, df = 7621.9, p-value = 0.01096
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.008962153 Inf
## sample estimates:
## mean of x mean of y
## 0.3084276 0.2766893
Number of Emergency Visits VS Readmitted
group_by(db.expl, readmitted) %>%
summarise(count = n(), mean = mean(number_emergency), sd = sd(number_emergency)) # The mean and standard deviation of the <30 group are higher
## # A tibble: 2 x 4
## readmitted count mean sd
## <fct> <int> <dbl> <dbl>
## 1 <30 6277 0.150 0.582
## 2 Not<30 63696 0.0994 0.504
boxplot(number_emergency~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Emergency Visits", ylab = "Readmission Status") # Distributions look similar, with fewer outliers for <30
var.test(number_emergency ~ readmitted, data = db.expl) # Cannot consider the variances to be equal (p<2.2e-16)
##
## F test to compare two variances
##
## data: number_emergency by readmitted
## F = 1.3333, num df = 6276, denom df = 63695, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 1.285695 1.383543
## sample estimates:
## ratio of variances
## 1.333349
t.test(db.expl$number_emergency[db.expl$readmitted == "<30"], db.expl$number_emergency[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE) # The results are highly significant (p=2.016e-11) that the mean of the <30 group is greater than the Not<30 group. Therefore, the average number of emergency visits in the last year for those readmitted within 30 days is higher than those not readmitted within 30 days
##
## Welch Two Sample t-test
##
## data: db.expl$number_emergency[db.expl$readmitted == "<30"] and db.expl$number_emergency[db.expl$readmitted == "Not<30"]
## t = 6.6131, df = 7234.1, p-value = 2.016e-11
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.0378318 Inf
## sample estimates:
## mean of x mean of y
## 0.1497531 0.0993940
Number of Inpatient Visits VS Readmitted
group_by(db.expl, readmitted) %>%
summarise(count = n(), mean = mean(number_inpatient), sd = sd(number_inpatient)) # The mean of the <30 group is over double the mean of Not<30. The sd is higher as well
## # A tibble: 2 x 4
## readmitted count mean sd
## <fct> <int> <dbl> <dbl>
## 1 <30 6277 0.369 0.983
## 2 Not<30 63696 0.157 0.546
boxplot(number_inpatient~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Inpatient Visits", ylab = "Readmission Status") # We're not able to see much here
var.test(number_inpatient ~ readmitted, data = db.expl) # Variances are not equal (p<2.2e-16)
##
## F test to compare two variances
##
## data: number_inpatient by readmitted
## F = 3.2355, num df = 6276, denom df = 63695, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 3.119891 3.357331
## sample estimates:
## ratio of variances
## 3.235529
t.test(db.expl$number_inpatient[db.expl$readmitted == "<30"], db.expl$number_inpatient[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE) # The findings are highly significant (p<2.2e-16) that the mean of the <30 group is greater than the Not<30 group. Therefore, the average number of inpatient visits in the last year for those readmitted within 30 days is higher than those not readmitted within 30 days
##
## Welch Two Sample t-test
##
## data: db.expl$number_inpatient[db.expl$readmitted == "<30"] and db.expl$number_inpatient[db.expl$readmitted == "Not<30"]
## t = 16.799, df = 6663.5, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.1908144 Inf
## sample estimates:
## mean of x mean of y
## 0.3688068 0.1572783
Number of Diagnoses VS Readmitted
group_by(db.expl, readmitted) %>%
summarise(count = n(), mean = mean(number_diagnoses), sd = sd(number_diagnoses)) # The mean of the <30 group is higher than the Not<30 group while the sd is lower
## # A tibble: 2 x 4
## readmitted count mean sd
## <fct> <int> <dbl> <dbl>
## 1 <30 6277 7.51 1.85
## 2 Not<30 63696 7.20 2.01
boxplot(number_diagnoses~readmitted, data=db.expl, horizontal = T, col = '#83afe6', xlab = "Number of Diagnoses", ylab = "Readmission Status") # The distributions appear identical here
var.test(number_diagnoses ~ readmitted, data = db.expl) # Variances are not equal (p<2.2e-16)
##
## F test to compare two variances
##
## data: number_diagnoses by readmitted
## F = 0.84113, num df = 6276, denom df = 63695, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.8110711 0.8727979
## sample estimates:
## ratio of variances
## 0.8411333
t.test(db.expl$number_diagnoses[db.expl$readmitted == "<30"], db.expl$number_diagnoses[db.expl$readmitted == "Not<30"], alternative = "greater", var.equal = FALSE) # Highly significant (p<2.2e-16) that the mean of the <30 group is greater than the Not<30 group. Therefore, the average number of diagnoses for those readmitted within 30 days is higher than those not readmitted within 30 days
##
## Welch Two Sample t-test
##
## data: db.expl$number_diagnoses[db.expl$readmitted == "<30"] and db.expl$number_diagnoses[db.expl$readmitted == "Not<30"]
## t = 12.903, df = 7822.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.277388 Inf
## sample estimates:
## mean of x mean of y
## 7.513143 7.195224
For the one remaining ordinal variable, age, the frequencies and histograms of each group will be reviewed, and a wilcoxon rank sum test will be completed to identify if there is a statistically significant difference between the two groups.
Age VS Readmitted
table(db.expl$age, db.expl$readmitted) # Values appear to peak similarly in the 70-80 range
##
## <30 Not<30
## [0-10) 3 150
## [10-20) 26 508
## [20-30) 83 1038
## [30-40) 188 2504
## [40-50) 506 6322
## [50-60) 879 11470
## [60-70) 1412 14272
## [70-80) 1817 15933
## [80-90) 1195 9907
## [90-100) 168 1592
group_by(db.expl, readmitted) %>%
summarise(count = n(), median = median(as.numeric(age))) # The median of the <30 group is 8 ([70-80)), while the median of the Not <30 group is 7 ([60-70))
## # A tibble: 2 x 3
## readmitted count median
## <fct> <int> <dbl>
## 1 <30 6277 8
## 2 Not<30 63696 7
ggplot(subset(db.expl, db.expl$readmitted == "<30"), aes(age)) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Age in Readmitted Group', y = 'Count')
ggplot(subset(db.expl, db.expl$readmitted == "Not<30"), aes(age)) + geom_bar(color = 'black', fill = '#83afe6') + theme_bw() + labs(x = 'Age in Not Readmitted Group', y = 'Count') # The distribution of the readmitted in <30 days group appears to have a bit more weight to the right than the other group
age.numeric <- as.numeric(db.expl$age) # Convert Age to its ordered IDs (1-10) so that it may be used with the wilcoxon rank sum test function
wilcox.test(age.numeric~db.expl$readmitted, alternative = 'greater', paired = F) # Findings are highly significant (p<2.2e-16) that the center of the <30 group is greater than the center of the Not<30 group. Therefore, patients who have been readmitted within 30 days are generally older
##
## Wilcoxon rank sum test with continuity correction
##
## data: age.numeric by db.expl$readmitted
## W = 219034815, p-value < 2.2e-16
## alternative hypothesis: true location shift is greater than 0
First we will define a custom function to be used to calculate the percent contribution of each cell of a table to the chi-squared test statistic to identify combinations of categories with the highest contributions.
percent_contrib <- function(x) {
y <- 100*x$residuals^2/x$statistic
return(round(y, 3))
}
For each categorical variable, we will review the frequencies for each readmission group, complete a chi-squared test to identify if the groups have a statistically significant difference, and then, if the findings are statistically significant, review the percent contribution and residuals of cells to identify associations between categories of the variables.
Race VS Readmitted
table(db.expl$race, db.expl$readmitted)
##
## <30 Not<30
## Caucasian 4942 49268
## AfricanAmerican 1093 11532
## Asian 41 447
## Hispanic 122 1378
## Other 79 1071
race_readm.chisq <- chisq.test(table(db.expl$race, db.expl$readmitted))
race_readm.chisq # The results are statistically significant at the alpha = 0.05 level (p=0.0311), indicating that there is a statistically significant association between the two variables (i.e., they are not independent)
##
## Pearson's Chi-squared test
##
## data: table(db.expl$race, db.expl$readmitted)
## X-squared = 10.625, df = 4, p-value = 0.03111
percent_contrib(race_readm.chisq) # The percent contribution (100*residual^2/chi-squared) indicates that the number of patients who identified their race as "Other" and were readmitted within 30 days contributed most to the total chi-square score (53.3%). Caucasian (12.1%), African American (13.0%), and Hispanic (11.0%) with <30 each contributed similarly.
##
## <30 Not<30
## Caucasian 12.089 1.191
## AfricanAmerican 12.991 1.280
## Asian 1.657 0.163
## Hispanic 11.032 1.087
## Other 53.260 5.249
round(race_readm.chisq$residuals,2) # We can see that since the residual is negative for the Other, <30 cell, there were fewer people observed than expected, indicating a negative association. Otherwise, we can see from the residuals that there was a positive association for Caucasian and <30, and a negative association for Aftrican American and Hispanic with <30. I.e., more Caucasian patients were readmitted within 30 days than expected, and fewer African American, Hispanic, and "Other" patients were. This could make sense given that in the U.S., ethnic minorities are more likely to be of a lower socioeconomic status, and therefore are less likely to have health insurance and feel comfortable seeking care
##
## <30 Not<30
## Caucasian 1.13 -0.36
## AfricanAmerican -1.17 0.37
## Asian -0.42 0.13
## Hispanic -1.08 0.34
## Other -2.38 0.75
Gender VS Readmitted
table(db.expl$gender, db.expl$readmitted)
##
## <30 Not<30
## Female 3361 33871
## Male 2916 29825
chisq.test(table(db.expl$gender, db.expl$readmitted)) # The chi-squared test is not statistically significant (p=0.5856), indicating that we cannot reject the null hypothesis that the two variables are independent
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(db.expl$gender, db.expl$readmitted)
## X-squared = 0.29729, df = 1, p-value = 0.5856
Admission Type ID VS Readmitted
table(db.expl$admission_type_id, db.expl$readmitted)
##
## <30 Not<30
## Emergency 3987 39372
## Urgent 1148 11654
## Elective 1141 12644
## Other 1 26
admtype_readm.chisq <- chisq.test(table(db.expl$admission_type_id, db.expl$readmitted))
admtype_readm.chisq # The results are highly significant (p=0.008417), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test
##
## data: table(db.expl$admission_type_id, db.expl$readmitted)
## X-squared = 11.717, df = 3, p-value = 0.008417
percent_contrib(admtype_readm.chisq) # The percent contribution indicates that the number of patients who were readmitted within 30 days for elective reasons contributed the most (63.1%), with those readmitted within 30 days through emergency contributing the next most (20.8%)
##
## <30 Not<30
## Emergency 20.831 2.053
## Urgent 0.001 0.000
## Elective 63.072 6.215
## Other 7.126 0.702
round(admtype_readm.chisq$residuals,2) # The residual is negative for <30, Elective, indicating that the number seen is less than expected. The residual is positive for <30, Emergency, indicating more people were admitted through emergency than expected. These findings suggest that if you are admitted to hospital for planned reasons, you are less likely to be readmitted within 30 days
##
## <30 Not<30
## Emergency 1.56 -0.49
## Urgent -0.01 0.00
## Elective -2.72 0.85
## Other -0.91 0.29
Discharge Disposition ID VS Readmitted
table(db.expl$discharge_disposition_id, db.expl$readmitted)
##
## <30 Not<30
## DcHome 3377 44192
## DcOtherFacility 868 3919
## DcSNF 1176 7608
## DcHomeWithHomeService 796 7566
## Other 60 411
dcid_readm.chisq <- chisq.test(table(db.expl$discharge_disposition_id, db.expl$readmitted))
dcid_readm.chisq # The results are highly significant (p<2.2e-16), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test
##
## data: table(db.expl$discharge_disposition_id, db.expl$readmitted)
## X-squared = 917.26, df = 4, p-value < 2.2e-16
percent_contrib(dcid_readm.chisq) # DcOtherFacility, <30 has the highest percent contribution (48.8%) with DcSNF (20.8%) and DcHome (20.3%) following
##
## <30 Not<30
## DcHome 20.247 1.995
## DcOtherFacility 48.833 4.812
## DcSNF 20.831 2.053
## DcHomeWithHomeService 0.306 0.030
## Other 0.813 0.080
round(dcid_readm.chisq$residuals,2) # There is a negative association between DcHome and <30, and a positive association between each DcOtherFacility and DcSNF with <30. This indicates that in the <30 days readmitted group, there were fewer people than expected discharged home, with more people than expected discharged to a Skilled Nursing Facility or other facility
##
## <30 Not<30
## DcHome -13.63 4.28
## DcOtherFacility 21.16 -6.64
## DcSNF 13.82 -4.34
## DcHomeWithHomeService 1.68 -0.53
## Other 2.73 -0.86
Admission Source ID VS Readmitted
table(db.expl$admission_source_id, db.expl$readmitted)
##
## <30 Not<30
## PhysRef 1871 19875
## TransferExtFacility 508 5372
## ER 3897 38431
## Other 1 18
admsource_readm.chisq <- chisq.test(table(db.expl$admission_source_id, db.expl$readmitted))
admsource_readm.chisq # The results are not statistically significant at the alpha = 0.05 level (p=0.0555), although they are tending towards significance
##
## Pearson's Chi-squared test
##
## data: table(db.expl$admission_source_id, db.expl$readmitted)
## X-squared = 7.5795, df = 3, p-value = 0.05555
Diagnosis 1 VS Readmitted
table(db.expl$diag_1, db.expl$readmitted)
##
## <30 Not<30
## 390-459 2067 19259
## Other 1457 14410
## 250.0-250.9 524 5224
## 460-519 536 5910
## 520-579 502 5823
## 710-739 341 3723
## 780-799 344 5159
## 800-999 506 4188
d1_readm.chisq <- chisq.test(table(db.expl$diag_1, db.expl$readmitted))
d1_readm.chisq # The results are highly significant (p<2.2e-16), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test
##
## data: table(db.expl$diag_1, db.expl$readmitted)
## X-squared = 96.623, df = 7, p-value < 2.2e-16
percent_contrib(d1_readm.chisq) # 780-799 (Symptoms, Signs, And Ill-Defined Conditions), <30 has the highest percent contribution (47.0%), with 800-999 (Injury And Poisoning), <30 (17.7%) and 390-459 (Diseases Of The Circulatory System), <30 (12.8%) following
##
## <30 Not<30
## 390-459 12.818 1.263
## Other 0.823 0.081
## 250.0-250.9 0.141 0.014
## 460-519 3.194 0.315
## 520-579 7.800 0.769
## 710-739 1.576 0.155
## 780-799 46.953 4.627
## 800-999 17.724 1.747
round(d1_readm.chisq$residuals,2) # There is a negative association between 780-799 and <30, and a positive association for each 800-999 and 390-459 with <30. Therefore, in the readmitted within 30 days group, there are fewer patients than expected with a primary diagnosis of Symptoms, Signs, And Ill-Defined Conditions, and more patients than expected with Injury and Poisoning, or a circulatory disorder
##
## <30 Not<30
## 390-459 3.52 -1.10
## Other 0.89 -0.28
## 250.0-250.9 0.37 -0.12
## 460-519 -1.76 0.55
## 520-579 -2.75 0.86
## 710-739 -1.23 0.39
## 780-799 -6.74 2.11
## 800-999 4.14 -1.30
Diagnosis 2 VS Readmitted
table(db.expl$diag_2, db.expl$readmitted)
##
## <30 Not<30
## 390-459 2020 20055
## Other 1965 19141
## 240-279(not250) 456 5149
## 250.0-250.9 827 8873
## 460-519 555 5890
## 580-629 454 4588
d2_readm.chisq <- chisq.test(table(db.expl$diag_2, db.expl$readmitted))
d2_readm.chisq # The results are statistically significant at the alpha = 0.05 level (p=0.03454), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test
##
## data: table(db.expl$diag_2, db.expl$readmitted)
## X-squared = 12.018, df = 5, p-value = 0.03454
percent_contrib(d2_readm.chisq) # 240-279(not250) (Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders) with <30 has the highest percent contribution (36.3%), with Other, <30 (22.6%) and 250.0-250.0 (Diabetes Mellitus), <30 (17.8%) following
##
## <30 Not<30
## 390-459 6.636 0.654
## Other 22.571 2.224
## 240-279(not250) 36.250 3.572
## 250.0-250.9 17.803 1.754
## 460-519 7.717 0.760
## 580-629 0.053 0.005
round(d2_readm.chisq$residuals,2) # There are fewer patients than expected in those who are readmitted with a secondary diagnosis of Diabetes Mellitus or Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders, and more than expected with "Other"
##
## <30 Not<30
## 390-459 0.89 -0.28
## Other 1.65 -0.52
## 240-279(not250) -2.09 0.66
## 250.0-250.9 -1.46 0.46
## 460-519 -0.96 0.30
## 580-629 0.08 -0.03
Diagnosis 3 VS Readmitted
table(db.expl$diag_3, db.expl$readmitted)
##
## <30 Not<30
## 390-459 1899 19949
## Other 1979 19167
## 240-279(not250) 502 5901
## 250.0-250.9 1079 11467
## 460-519 444 3801
## 580-629 374 3411
d3_readm.chisq <- chisq.test(table(db.expl$diag_3, db.expl$readmitted))
d3_readm.chisq # The results are highly significant (p=3.032e-06), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test
##
## data: table(db.expl$diag_3, db.expl$readmitted)
## X-squared = 33.472, df = 5, p-value = 3.032e-06
percent_contrib(d3_readm.chisq) # 460-519 (Diseases Of The Respiratory System) with <30 has the highest percent contribution (31.3%), with 240-279(not250) (Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders), <30 (27.3%) following
##
## <30 Not<30
## 390-459 5.653 0.557
## Other 10.610 1.046
## 240-279(not250) 27.254 2.686
## 250.0-250.9 5.728 0.564
## 460-519 31.334 3.088
## 580-629 10.450 1.030
round(d3_readm.chisq$residuals,2) # There are more patients than expected in those who are readmitted with a tertiary diagnosis of Diseases Of The Respiratory System, and fewer than expected with Endocrine, Nutritional And Metabolic Diseases, And Immunity Disorders
##
## <30 Not<30
## 390-459 -1.38 0.43
## Other 1.88 -0.59
## 240-279(not250) -3.02 0.95
## 250.0-250.9 -1.38 0.43
## 460-519 3.24 -1.02
## 580-629 1.87 -0.59
A1C Result VS Readmitted
table(db.expl$A1Cresult, db.expl$readmitted)
##
## <30 Not<30
## >7 755 8349
## None 5199 51929
## Norm 323 3418
A1C_readm.chisq <- chisq.test(table(db.expl$A1Cresult, db.expl$readmitted))
A1C_readm.chisq # The results are statistically significant at the alpha = 0.05 level (p=0.03305), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test
##
## data: table(db.expl$A1Cresult, db.expl$readmitted)
## X-squared = 6.8195, df = 2, p-value = 0.03305
percent_contrib(A1C_readm.chisq) # >7 (abnormal HbA1c result) and <30 has the highest percent contribution (68.3%)
##
## <30 Not<30
## >7 68.318 6.732
## None 15.785 1.556
## Norm 6.926 0.683
round(A1C_readm.chisq$residuals,2) # There is a negative association between >7 and <30, indicating that patients who were readmitted within 30 days were less likely than expected to have an abnormal HbA1c result. This is very interesting. It appears that having an abnormal result may be a protective factor to getting readmitted. This would likely be because as a result of the abnormal finding, some change was made to the patient's treatment regimen leading to better controlled diabetes and a reduction in complications. It is noteworthy however that 81.6% of cases had no HbA1c test done in hospital, so they wouldn't know if the result was normal or abnormal
##
## <30 Not<30
## >7 -2.16 0.68
## None 1.04 -0.33
## Norm -0.69 0.22
Metformin VS Readmitted
table(db.expl$metformin, db.expl$readmitted)
##
## <30 Not<30
## Down 44 391
## No 5040 50030
## Steady 1135 12499
## Up 58 776
met_readm.chisq <- chisq.test(table(db.expl$metformin, db.expl$readmitted))
met_readm.chisq # The results are highly significant (p=0.002862), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test
##
## data: table(db.expl$metformin, db.expl$readmitted)
## X-squared = 14.032, df = 3, p-value = 0.002862
percent_contrib(met_readm.chisq) # Steady and <30 has the highest percent contribution (45.2%) with Up and <30 (26.9%) following
##
## <30 Not<30
## Down 4.525 0.446
## No 14.394 1.418
## Steady 45.177 4.452
## Up 26.933 2.654
round(met_readm.chisq$residuals,2) # There are negative associations between Steady and <30 and Up and <30. This indicates that in the group of patients who are readmitted within 30 days, there are fewer cases than expected where they were on metformin with no change in dosage, and where their metformin dosages were increased.
##
## <30 Not<30
## Down 0.80 -0.25
## No 1.42 -0.45
## Steady -2.52 0.79
## Up -1.94 0.61
Glimepiride VS Readmitted
table(db.expl$glimepiride, db.expl$readmitted)
##
## <30 Not<30
## Down 18 118
## No 5953 60323
## Steady 283 3048
## Up 23 207
glim_readm.chisq <- chisq.test(table(db.expl$glimepiride, db.expl$readmitted))
glim_readm.chisq # The findings are not statistically significant (p=0.235), indicating that we cannot reject the null hypothesis that the two variables are independent
##
## Pearson's Chi-squared test
##
## data: table(db.expl$glimepiride, db.expl$readmitted)
## X-squared = 4.2574, df = 3, p-value = 0.235
Glipizide VS Readmitted
table(db.expl$glipizide, db.expl$readmitted)
##
## <30 Not<30
## Down 48 323
## No 5400 55566
## Steady 766 7297
## Up 63 510
glip_readm.chisq <- chisq.test(table(db.expl$glipizide, db.expl$readmitted))
glip_readm.chisq # The results are highly significant (p=0.003262), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test
##
## data: table(db.expl$glipizide, db.expl$readmitted)
## X-squared = 13.752, df = 3, p-value = 0.003262
percent_contrib(glip_readm.chisq) # Down and <30 has the highest percent contribution (47.3%), with Up, <30 (19.0%) and Steady, <30 (18.3%) following
##
## <30 Not<30
## Down 47.336 4.665
## No 6.333 0.624
## Steady 18.330 1.806
## Up 19.030 1.875
round(glip_readm.chisq$residuals,2) # There are positive associations between each Down, Steady and Up with <30. This indicates there are more people than expected in the readmitted group who are on glipizide with or without a dosage change. It is noteworthy that glipizide, glimepiride and glyburide are all part of the sulfonylurea class of medications, which are associated with increased risk of cardiovascular death as well as episodes of severe hypoglycemia
##
## <30 Not<30
## Down 2.55 -0.80
## No -0.93 0.29
## Steady 1.59 -0.50
## Up 1.62 -0.51
Glyburide VS Readmitted
table(db.expl$glyburide, db.expl$readmitted)
##
## <30 Not<30
## Down 40 378
## No 5548 56650
## Steady 630 6114
## Up 59 554
gly_readm.chisq <- chisq.test(table(db.expl$glyburide, db.expl$readmitted))
gly_readm.chisq # The results are not statistically significant (p=0.6068), indicating we fail to reject the null hypothesis that the variables are independent
##
## Pearson's Chi-squared test
##
## data: table(db.expl$glyburide, db.expl$readmitted)
## X-squared = 1.8376, df = 3, p-value = 0.6068
Pioglitazone VS Readmitted
table(db.expl$pioglitazone, db.expl$readmitted)
##
## <30 Not<30
## Down 9 72
## No 5818 58892
## Steady 429 4575
## Up 21 157
pio_readm.chisq <- chisq.test(table(db.expl$pioglitazone, db.expl$readmitted))
pio_readm.chisq # The results are not statistically significant (p=0.3622), indicating we fail to reject the null hypothesis that the variables are independent
##
## Pearson's Chi-squared test
##
## data: table(db.expl$pioglitazone, db.expl$readmitted)
## X-squared = 3.1974, df = 3, p-value = 0.3622
Rosiglitazone VS Readmitted
table(db.expl$rosiglitazone, db.expl$readmitted)
##
## <30 Not<30
## Down 4 70
## No 5867 59445
## Steady 393 4062
## Up 13 119
rosi_readm.chisq <- chisq.test(table(db.expl$rosiglitazone, db.expl$readmitted))
rosi_readm.chisq # The results are not statistically significant (p=0.7032), indicating we fail to reject the null hypothesis that the variables are independent. Of note, pioglitazone and rosiglitazone come from the same class of medications (glitazones)
##
## Pearson's Chi-squared test
##
## data: table(db.expl$rosiglitazone, db.expl$readmitted)
## X-squared = 1.41, df = 3, p-value = 0.7032
Insulin VS Readmitted
table(db.expl$insulin, db.expl$readmitted)
##
## <30 Not<30
## Down 772 6549
## No 2837 31421
## Steady 2001 19616
## Up 667 6110
insulin_readm.chisq <- chisq.test(table(db.expl$insulin, db.expl$readmitted))
insulin_readm.chisq # The results are highly significant (p=5.876e-11), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test
##
## data: table(db.expl$insulin, db.expl$readmitted)
## X-squared = 50.626, df = 3, p-value = 5.876e-11
percent_contrib(insulin_readm.chisq) # Down, <30 has the highest percent contribution (40.0%), with No, <30 (35.8%) following
##
## <30 Not<30
## Down 39.958 3.938
## No 35.844 3.532
## Steady 3.893 0.384
## Up 11.334 1.117
round(insulin_readm.chisq$residuals,2) # There is a positive association between Down and <30, and a negative association between No and <30. This indicates that patients who were readmitted had more cases than expected where they were taking insulin and the dosage was decreased, and fewer patients who were not on insulin than expected. Insulin is a medication requiring close daily monitoring of blood sugar levels. A reduction in dosage without proper follow-up and monitoring could lead to hyperglycemia and eventually hospitalization for complications (e.g., diabetic ketoacidosis)
##
## <30 Not<30
## Down 4.50 -1.41
## No -4.26 1.34
## Steady 1.40 -0.44
## Up 2.40 -0.75
Change VS Readmitted
table(db.expl$change, db.expl$readmitted)
##
## <30 Not<30
## Ch 2971 28520
## No 3306 35176
ch_readm.chisq <- chisq.test(table(db.expl$change, db.expl$readmitted))
ch_readm.chisq # The results are highly significant (p=0.0001085), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(db.expl$change, db.expl$readmitted)
## X-squared = 14.983, df = 1, p-value = 0.0001085
percent_contrib(ch_readm.chisq) # Ch, <30 has the highest percent contribution (50.4%), with No, <30 (41.3%) following
##
## <30 Not<30
## Ch 50.407 4.967
## No 41.249 4.065
round(ch_readm.chisq$residuals,2) # There is a positive association between Ch and <30, and a negative association between No and <30. This indicates that in the <30 day readmitted group, there were significantly more people than expected who had a change in diabetic medications in hospital, and fewer than expected who did not have a change. One reason for this could be that there may not be adequate follow-up relating to changes in diabetic medications, leading to complications and re-hospitalization
##
## <30 Not<30
## Ch 2.75 -0.86
## No -2.49 0.78
Diabetes Meds VS Readmitted
table(db.expl$diabetesMed, db.expl$readmitted)
##
## <30 Not<30
## No 1259 15421
## Yes 5018 48275
dbm_readm.chisq <- chisq.test(table(db.expl$diabetesMed, db.expl$readmitted))
dbm_readm.chisq # The results are highly significant (p=1.953e-13), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(db.expl$diabetesMed, db.expl$readmitted)
## X-squared = 54.052, df = 1, p-value = 1.953e-13
percent_contrib(dbm_readm.chisq) # No, <30 has the highest percent contribution (69.6%), with Yes, <30 (22.0%) following
##
## <30 Not<30
## No 69.623 6.861
## Yes 21.791 2.147
round(dbm_readm.chisq$residuals,2) # There is a negative association between No diabetes meds and <30, with a positive association between Yes diabetes meds and <30. This indicates that of the people who were readmitted, fewer than expected are not on diabetic medications, and more than expected are on diabetic medications. These results are expected, as a lack of diabetic medications for diabetes indicates the individual is earlier in the disease progression and therefore their condition is less severe. Diabetic medications, if not monitored appropriately, can lead to complications such as hypo- and hyperglycemia, sometimes requiring hospitalization
##
## <30 Not<30
## No -6.13 1.93
## Yes 3.43 -1.08
Due to the above finding that patients in the readmitted group were less likely to have an abnormal HbA1c result, I’d like to explore whether there is a connection between an abnormal HbA1c Result and there being a potentially resulting change in diabetic medication during stay.
A1c Result VS Change in Medication
table(db.expl$A1Cresult, db.expl$change)
##
## Ch No
## >7 5473 3631
## None 24410 32718
## Norm 1608 2133
A1C_change.chisq <- chisq.test(table(db.expl$A1Cresult, db.expl$change))
A1C_change.chisq # The results are highly significant (p<2.2e-16), indicating that there is an association between the two variables
##
## Pearson's Chi-squared test
##
## data: table(db.expl$A1Cresult, db.expl$change)
## X-squared = 965.75, df = 2, p-value < 2.2e-16
percent_contrib(A1C_change.chisq) # >7 (abnormal HbA1c Result) and Change in medication has the highest percent contribution (47.8%), with >7 and No change in medication (39.2%) following
##
## Ch No
## >7 47.836 39.145
## None 6.808 5.571
## Norm 0.352 0.288
round(A1C_change.chisq$residuals,2) # There are more patients than expected in the group of those with an abnormal HbA1c and a Change in medication, and fewer patients than expected in the group of those with an abnormal HbA1c and No change. This supports the idea that an abnormal HbA1c finding may contribute to a change in diabetic medication. However, more focused study would need to be completed to identify if this relationship may be causal
##
## Ch No
## >7 21.49 -19.44
## None -8.11 7.34
## Norm -1.84 1.67
Given that we found there was a positive association between a reduction in insulin dosage and being readmitted within 30 days, I would like compare this to discharge disposition ID as well in the subset of patients who received a reduction in insulin dosage.
Reduction in Insulin VS Readmitted VS Discharge
insDown <- db.expl[db.expl$insulin == 'Down',] # Create subset
table(insDown$discharge_disposition_id, insDown$readmitted)
##
## <30 Not<30
## DcHome 392 4347
## DcOtherFacility 122 378
## DcSNF 149 896
## DcHomeWithHomeService 101 883
## Other 8 45
ggplot(insDown, aes(x=fct_infreq(discharge_disposition_id), fill = readmitted)) + geom_bar() + theme_bw() + labs(x = 'Discharge Disposition ID', y = 'Count', title = 'Discharge and Readmission Status for Patients with Insulin Reduction') + theme(axis.text.x = element_text(angle = 90, hjust = 1))
notinsDown <- db.expl[db.expl$insulin != 'Down',]
ggplot(notinsDown, aes(x=fct_infreq(discharge_disposition_id), fill = readmitted)) + geom_bar() + theme_bw() + labs(x = 'Discharge Disposition ID', y = 'Count', title = 'Discharge and Readmission Status for Patients Without Insulin Reduction') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Subset with insulin reduction appears to have slightly higher proportions of not readmitted in every category except for DcHome
insDown_dcid_readm.chisq <- chisq.test(table(insDown$discharge_disposition_id, insDown$readmitted))
insDown_dcid_readm.chisq # The chi-squared results are highly significant (p<2.2e-16) that there is an association between readmitted and discharge_disposition_id within the subset of patients who had a decrease in insulin dosage
##
## Pearson's Chi-squared test
##
## data: table(insDown$discharge_disposition_id, insDown$readmitted)
## X-squared = 144.23, df = 4, p-value < 2.2e-16
percent_contrib(insDown_dcid_readm.chisq) # DcOtherFacility, <30 has the highest percent contribution (63.1%) with DcHome (16.1%) following
##
## <30 Not<30
## DcHome 16.101 1.898
## DcOtherFacility 63.107 7.439
## DcSNF 9.474 1.117
## DcHomeWithHomeService 0.051 0.006
## Other 0.721 0.085
round(insDown_dcid_readm.chisq$residuals,2) # There is a positive association between DcOtherFacility and <30, and a negative association between DcHome and <30. This indicates that in the <30 group, more patients than expected were DcOtherFacility, and fewer than expected were discharged home. These findings are similar to the bivariate comparison of Discharge Disposition ID and Readmitted in the whole dataset, although in this subset, the DcOtherFacility, <30 group has a higher percent contribution (63.1% vs 48.8%), suggesting that with an insulin dosage reduction, patients may be even more likely to be discharged to another facility in the readmitted group
##
## <30 Not<30
## DcHome -4.82 1.65
## DcOtherFacility 9.54 -3.28
## DcSNF 3.70 -1.27
## DcHomeWithHomeService -0.27 0.09
## Other 1.02 -0.35
On another note, given that there were positive associations between medication change and readmission, and medication change and an abnormal A1C result, but a negative association between abnormal A1C and readmission, I would like to further explore whether within the group of people with a change in medication there is still a negative association between abnormal A1C and readmission.
Readmitted <30 VS A1C Result VS Change in Medication
ch <- db.expl[db.expl$change == 'Ch',] # Create subset
table(ch$A1Cresult, ch$readmitted)
##
## <30 Not<30
## >7 473 5000
## None 2359 22051
## Norm 139 1469
ggplot(ch, aes(x=fct_infreq(A1Cresult), fill = readmitted)) + geom_bar() + theme_bw() + labs(x = 'A1C Result', y = 'Count', title = 'A1C and Readmission Status for Patients Who Had a Med Change') + theme(axis.text.x = element_text(angle = 90, hjust = 1))
noch <- db.expl[db.expl$change != 'Ch',]
ggplot(noch, aes(fct_infreq(A1Cresult), fill = readmitted)) + geom_bar() + theme_bw() + labs(x = 'A1C Result', y = 'Count', title = 'A1C and Readmission Status for Patients with No Med Change') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) # There appears to be a higher proportion of patients with an abnormal A1C (>7) who were not readmitted in the change in medication group
A1C_readm_ch.chisq <- chisq.test(table(ch$A1Cresult, ch$readmitted))
A1C_readm_ch.chisq # The chi-squared results are significant at the alpha=0.05 level (p=0.03509), indicating that there is an association between A1C result and readmitted within the change in medication group
##
## Pearson's Chi-squared test
##
## data: table(ch$A1Cresult, ch$readmitted)
## X-squared = 6.6995, df = 2, p-value = 0.03509
percent_contrib(A1C_readm_ch.chisq) # >7, <30 has the highest pecent contribution (54.3%)
##
## <30 Not<30
## >7 54.317 5.658
## None 20.364 2.121
## Norm 15.884 1.655
round(A1C_readm_ch.chisq$residuals,2) # There is still a negative association between >7 and <30, indicating that even within the group of patients who had a change in medication, there were fewer patients than expected who had an abnormal A1C result and were readmitted. However, the percent contribution is lower than that of >7, <30 for the greater sample (68.3%)
##
## <30 Not<30
## >7 -1.91 0.62
## None 1.17 -0.38
## Norm -1.03 0.33
In order to use the data for modeling, we will:
These transformations will be put in a new dataframe object simply entitled “db”.
First we will define a function to complete the min-max scaling on numeric attributes.
minmax <- function(x) {
return((x-min(x))/(max(x)-min(x)))
} # All resulting values will be between 0 and 1
Numeric Attributes
db <- db.expl # Create new dataframe entitled "db" to house data transformed for modeling
db[,1:8] <- lapply(db.expl[,1:8], minmax) # Apply min-max scaling to all numeric attributes and save them to the new dataframe
Ordinal Attribute
db$age <- minmax(as.numeric(db.expl$age)) # Convert Age to its ordered numeric IDs (1-10), and then scale the values
Categorical Attributes
# First convert categorical variables with two categories (gender, change, diabetesMed, readmitted) to 0 and 1 so that they don't get converted into two variables each with the one-hot encoding
db$gender <- as.numeric(db.expl$gender)
db$gender[db$gender == 1] <- 0 # 0 will be female
db$gender[db$gender == 2] <- 1 # 1 will be male
db$change <- as.numeric(db.expl$change)
db$change[db$change == 2] <- 0 # 1 will be change ('Ch'), 0 will be no change ('No')
db$diabetesMed <- as.numeric(db.expl$diabetesMed)
db$diabetesMed[db$diabetesMed == 1] <- 0 # 0 will be No
db$diabetesMed[db$diabetesMed == 2] <- 1 # 1 will be Yes
db$readmitted <- as.numeric(db.expl$readmitted)
db$readmitted[db$readmitted == 2] <- 0 # Not<30 will be 0, while <30 will be 1
# Now one-hot encode the remaining categorical variables (factor type)
db <- one_hot(as.data.table(db)) # Replaced 19 categorical variables with 77 binary dummy variables
Now we may review the final structure of the dataset to be modeled, and export it.
str(db) # There are now a total of 82 variables, all between 0 and 1
## Classes 'data.table' and 'data.frame': 69973 obs. of 82 variables:
## $ time_in_hospital : num 0.923 0.846 0 0.615 0.154 ...
## $ num_lab_procedures : num 0.511 0.244 0.382 0.351 0.229 ...
## $ num_procedures : num 0.333 0.5 0 0.333 1 ...
## $ num_medications : num 0.3375 0.2125 0.0875 0.2 0.1875 ...
## $ number_outpatient : num 0 0 0 0 0 0 0 0 0 0 ...
## $ number_emergency : num 0 0 0 0 0 ...
## $ number_inpatient : num 0 0 0 0 0 0 0 0 0 0 ...
## $ number_diagnoses : num 0.467 0.467 0.267 0.533 0.533 ...
## $ age : num 0.889 1 0.444 0.444 0.556 ...
## $ A1Cresult_>7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ A1Cresult_None : int 1 1 1 1 1 1 1 1 1 1 ...
## $ A1Cresult_Norm : int 0 0 0 0 0 0 0 0 0 0 ...
## $ race_Caucasian : int 1 1 1 0 1 0 1 1 1 0 ...
## $ race_AfricanAmerican : int 0 0 0 1 0 1 0 0 0 1 ...
## $ race_Asian : int 0 0 0 0 0 0 0 0 0 0 ...
## $ race_Hispanic : int 0 0 0 0 0 0 0 0 0 0 ...
## $ race_Other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ gender : num 0 0 1 0 1 1 0 1 1 0 ...
## $ admission_type_id_Emergency : int 0 0 1 1 0 0 1 1 0 0 ...
## $ admission_type_id_Urgent : int 1 0 0 0 1 1 0 0 0 0 ...
## $ admission_type_id_Elective : int 0 1 0 0 0 0 0 0 1 1 ...
## $ admission_type_id_Other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id_DcHome : int 1 0 1 1 1 1 0 0 1 1 ...
## $ discharge_disposition_id_DcOtherFacility : int 0 0 0 0 0 0 0 0 0 0 ...
## $ discharge_disposition_id_DcSNF : int 0 1 0 0 0 0 1 0 0 0 ...
## $ discharge_disposition_id_DcHomeWithHomeService: int 0 0 0 0 0 0 0 1 0 0 ...
## $ discharge_disposition_id_Other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_source_id_PhysRef : int 0 0 0 0 0 0 0 0 0 0 ...
## $ admission_source_id_TransferExtFacility : int 1 1 0 0 1 1 0 0 1 1 ...
## $ admission_source_id_ER : int 0 0 1 1 0 0 1 1 0 0 ...
## $ admission_source_id_Other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_1_390-459 : int 1 1 0 0 1 0 1 1 1 0 ...
## $ diag_1_Other : int 0 0 1 0 0 1 0 0 0 0 ...
## $ diag_1_250.0-250.9 : int 0 0 0 1 0 0 0 0 0 0 ...
## $ diag_1_460-519 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ diag_1_520-579 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_1_710-739 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_1_780-799 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_1_800-999 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_2_390-459 : int 1 0 0 1 1 0 0 1 1 0 ...
## $ diag_2_Other : int 0 1 1 0 0 1 0 0 0 1 ...
## $ diag_2_240-279(not250) : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_2_250.0-250.9 : int 0 0 0 0 0 0 1 0 0 0 ...
## $ diag_2_460-519 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_2_580-629 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_3_390-459 : int 0 0 0 0 0 0 0 1 0 0 ...
## $ diag_3_Other : int 1 0 0 1 0 1 0 0 1 0 ...
## $ diag_3_240-279(not250) : int 0 0 0 0 0 0 0 0 0 0 ...
## $ diag_3_250.0-250.9 : int 0 0 1 0 1 0 1 0 0 0 ...
## $ diag_3_460-519 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ diag_3_580-629 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ metformin_Down : int 0 0 0 0 0 0 0 0 0 0 ...
## $ metformin_No : int 1 1 1 1 1 1 0 1 0 1 ...
## $ metformin_Steady : int 0 0 0 0 0 0 1 0 1 0 ...
## $ metformin_Up : int 0 0 0 0 0 0 0 0 0 0 ...
## $ glimepiride_Down : int 0 0 0 0 0 0 0 0 0 0 ...
## $ glimepiride_No : int 1 1 1 1 1 1 1 1 0 1 ...
## $ glimepiride_Steady : int 0 0 0 0 0 0 0 0 1 0 ...
## $ glimepiride_Up : int 0 0 0 0 0 0 0 0 0 0 ...
## $ glipizide_Down : int 0 0 0 0 0 0 0 0 0 0 ...
## $ glipizide_No : int 0 1 0 1 1 1 1 1 1 1 ...
## $ glipizide_Steady : int 1 0 1 0 0 0 0 0 0 0 ...
## $ glipizide_Up : int 0 0 0 0 0 0 0 0 0 0 ...
## $ glyburide_Down : int 0 0 0 0 0 0 0 0 0 0 ...
## $ glyburide_No : int 1 1 1 1 1 0 1 1 1 1 ...
## $ glyburide_Steady : int 0 0 0 0 0 0 0 0 0 0 ...
## $ glyburide_Up : int 0 0 0 0 0 1 0 0 0 0 ...
## $ pioglitazone_Down : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pioglitazone_No : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pioglitazone_Steady : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pioglitazone_Up : int 0 0 0 0 0 0 0 0 0 0 ...
## $ rosiglitazone_Down : int 0 0 0 0 0 0 0 0 0 0 ...
## $ rosiglitazone_No : int 1 0 1 1 1 1 1 1 1 1 ...
## $ rosiglitazone_Steady : int 0 1 0 0 0 0 0 0 0 0 ...
## $ rosiglitazone_Up : int 0 0 0 0 0 0 0 0 0 0 ...
## $ insulin_Down : int 0 0 0 0 0 0 1 0 0 0 ...
## $ insulin_No : int 0 0 0 0 0 0 0 0 0 0 ...
## $ insulin_Steady : int 1 1 1 1 1 1 0 1 1 1 ...
## $ insulin_Up : int 0 0 0 0 0 0 0 0 0 0 ...
## $ change : num 1 1 1 0 0 1 1 0 1 0 ...
## $ diabetesMed : num 1 1 1 1 1 1 1 1 1 1 ...
## $ readmitted : num 0 0 0 0 0 1 1 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
#write.csv(db, '/Users/amyhowe/Desktop/Dataset_Modeling.csv', row.names = F) # Export dataset for modeling